]> granicus.if.org Git - postgresql/blob - src/backend/commands/vacuumlazy.c
pgindent run for 9.5
[postgresql] / src / backend / commands / vacuumlazy.c
1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem (or perhaps
14  * autovacuum_work_mem) memory space to keep track of dead tuples.  We
15  * initially allocate an array of TIDs of that size, with an upper limit that
16  * depends on table size (this limit ensures we don't allocate a huge area
17  * uselessly for vacuuming small tables).  If the array threatens to overflow,
18  * we suspend the heap scan phase and perform a pass of index cleanup and page
19  * compaction, then resume the heap scan with an empty TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        src/backend/commands/vacuumlazy.c
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/heapam_xlog.h"
43 #include "access/htup_details.h"
44 #include "access/multixact.h"
45 #include "access/transam.h"
46 #include "access/visibilitymap.h"
47 #include "access/xlog.h"
48 #include "catalog/catalog.h"
49 #include "catalog/storage.h"
50 #include "commands/dbcommands.h"
51 #include "commands/vacuum.h"
52 #include "miscadmin.h"
53 #include "pgstat.h"
54 #include "portability/instr_time.h"
55 #include "postmaster/autovacuum.h"
56 #include "storage/bufmgr.h"
57 #include "storage/freespace.h"
58 #include "storage/lmgr.h"
59 #include "utils/lsyscache.h"
60 #include "utils/memutils.h"
61 #include "utils/pg_rusage.h"
62 #include "utils/timestamp.h"
63 #include "utils/tqual.h"
64
65
66 /*
67  * Space/time tradeoff parameters: do these need to be user-tunable?
68  *
69  * To consider truncating the relation, we want there to be at least
70  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
71  * is less) potentially-freeable pages.
72  */
73 #define REL_TRUNCATE_MINIMUM    1000
74 #define REL_TRUNCATE_FRACTION   16
75
76 /*
77  * Timing parameters for truncate locking heuristics.
78  *
79  * These were not exposed as user tunable GUC values because it didn't seem
80  * that the potential for improvement was great enough to merit the cost of
81  * supporting them.
82  */
83 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL             20              /* ms */
84 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL              50              /* ms */
85 #define VACUUM_TRUNCATE_LOCK_TIMEOUT                    5000    /* ms */
86
87 /*
88  * Guesstimation of number of dead tuples per page.  This is used to
89  * provide an upper limit to memory allocated when vacuuming small
90  * tables.
91  */
92 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
93
94 /*
95  * Before we consider skipping a page that's marked as clean in
96  * visibility map, we must've seen at least this many clean pages.
97  */
98 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
99
100 typedef struct LVRelStats
101 {
102         /* hasindex = true means two-pass strategy; false means one-pass */
103         bool            hasindex;
104         /* Overall statistics about rel */
105         BlockNumber old_rel_pages;      /* previous value of pg_class.relpages */
106         BlockNumber rel_pages;          /* total number of pages */
107         BlockNumber scanned_pages;      /* number of pages we examined */
108         BlockNumber pinskipped_pages;           /* # of pages we skipped due to a pin */
109         double          scanned_tuples; /* counts only tuples on scanned pages */
110         double          old_rel_tuples; /* previous value of pg_class.reltuples */
111         double          new_rel_tuples; /* new estimated total # of tuples */
112         double          new_dead_tuples;        /* new estimated total # of dead tuples */
113         BlockNumber pages_removed;
114         double          tuples_deleted;
115         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
116         /* List of TIDs of tuples we intend to delete */
117         /* NB: this list is ordered by TID address */
118         int                     num_dead_tuples;        /* current # of entries */
119         int                     max_dead_tuples;        /* # slots allocated in array */
120         ItemPointer dead_tuples;        /* array of ItemPointerData */
121         int                     num_index_scans;
122         TransactionId latestRemovedXid;
123         bool            lock_waiter_detected;
124 } LVRelStats;
125
126
127 /* A few variables that don't seem worth passing around as parameters */
128 static int      elevel = -1;
129
130 static TransactionId OldestXmin;
131 static TransactionId FreezeLimit;
132 static MultiXactId MultiXactCutoff;
133
134 static BufferAccessStrategy vac_strategy;
135
136
137 /* non-export function prototypes */
138 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
139                            Relation *Irel, int nindexes, bool scan_all);
140 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
141 static bool lazy_check_needs_freeze(Buffer buf);
142 static void lazy_vacuum_index(Relation indrel,
143                                   IndexBulkDeleteResult **stats,
144                                   LVRelStats *vacrelstats);
145 static void lazy_cleanup_index(Relation indrel,
146                                    IndexBulkDeleteResult *stats,
147                                    LVRelStats *vacrelstats);
148 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
149                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
150 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
151 static BlockNumber count_nondeletable_pages(Relation onerel,
152                                                  LVRelStats *vacrelstats);
153 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
154 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
155                                            ItemPointer itemptr);
156 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
157 static int      vac_cmp_itemptr(const void *left, const void *right);
158 static bool heap_page_is_all_visible(Relation rel, Buffer buf,
159                                                  TransactionId *visibility_cutoff_xid);
160
161
162 /*
163  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
164  *
165  *              This routine vacuums a single heap, cleans out its indexes, and
166  *              updates its relpages and reltuples statistics.
167  *
168  *              At entry, we have already established a transaction and opened
169  *              and locked the relation.
170  */
171 void
172 lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
173                                 BufferAccessStrategy bstrategy)
174 {
175         LVRelStats *vacrelstats;
176         Relation   *Irel;
177         int                     nindexes;
178         BlockNumber possibly_freeable;
179         PGRUsage        ru0;
180         TimestampTz starttime = 0;
181         long            secs;
182         int                     usecs;
183         double          read_rate,
184                                 write_rate;
185         bool            scan_all;               /* should we scan all pages? */
186         bool            scanned_all;    /* did we actually scan all pages? */
187         TransactionId xidFullScanLimit;
188         MultiXactId mxactFullScanLimit;
189         BlockNumber new_rel_pages;
190         double          new_rel_tuples;
191         BlockNumber new_rel_allvisible;
192         double          new_live_tuples;
193         TransactionId new_frozen_xid;
194         MultiXactId new_min_multi;
195
196         Assert(params != NULL);
197
198         /* measure elapsed time iff autovacuum logging requires it */
199         if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
200         {
201                 pg_rusage_init(&ru0);
202                 starttime = GetCurrentTimestamp();
203         }
204
205         if (options & VACOPT_VERBOSE)
206                 elevel = INFO;
207         else
208                 elevel = DEBUG2;
209
210         vac_strategy = bstrategy;
211
212         vacuum_set_xid_limits(onerel,
213                                                   params->freeze_min_age,
214                                                   params->freeze_table_age,
215                                                   params->multixact_freeze_min_age,
216                                                   params->multixact_freeze_table_age,
217                                                   &OldestXmin, &FreezeLimit, &xidFullScanLimit,
218                                                   &MultiXactCutoff, &mxactFullScanLimit);
219
220         /*
221          * We request a full scan if either the table's frozen Xid is now older
222          * than or equal to the requested Xid full-table scan limit; or if the
223          * table's minimum MultiXactId is older than or equal to the requested
224          * mxid full-table scan limit.
225          */
226         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
227                                                                                          xidFullScanLimit);
228         scan_all |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
229                                                                                         mxactFullScanLimit);
230
231         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
232
233         vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
234         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
235         vacrelstats->num_index_scans = 0;
236         vacrelstats->pages_removed = 0;
237         vacrelstats->lock_waiter_detected = false;
238
239         /* Open all indexes of the relation */
240         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
241         vacrelstats->hasindex = (nindexes > 0);
242
243         /* Do the vacuuming */
244         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
245
246         /* Done with indexes */
247         vac_close_indexes(nindexes, Irel, NoLock);
248
249         /*
250          * Compute whether we actually scanned the whole relation. If we did, we
251          * can adjust relfrozenxid and relminmxid.
252          *
253          * NB: We need to check this before truncating the relation, because that
254          * will change ->rel_pages.
255          */
256         if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
257         {
258                 Assert(!scan_all);
259                 scanned_all = false;
260         }
261         else
262                 scanned_all = true;
263
264         /*
265          * Optionally truncate the relation.
266          *
267          * Don't even think about it unless we have a shot at releasing a goodly
268          * number of pages.  Otherwise, the time taken isn't worth it.
269          */
270         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
271         if (possibly_freeable > 0 &&
272                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
273                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
274                 lazy_truncate_heap(onerel, vacrelstats);
275
276         /* Vacuum the Free Space Map */
277         FreeSpaceMapVacuum(onerel);
278
279         /*
280          * Update statistics in pg_class.
281          *
282          * A corner case here is that if we scanned no pages at all because every
283          * page is all-visible, we should not update relpages/reltuples, because
284          * we have no new information to contribute.  In particular this keeps us
285          * from replacing relpages=reltuples=0 (which means "unknown tuple
286          * density") with nonzero relpages and reltuples=0 (which means "zero
287          * tuple density") unless there's some actual evidence for the latter.
288          *
289          * We do update relallvisible even in the corner case, since if the table
290          * is all-visible we'd definitely like to know that.  But clamp the value
291          * to be not more than what we're setting relpages to.
292          *
293          * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
294          * since then we don't know for certain that all tuples have a newer xmin.
295          */
296         new_rel_pages = vacrelstats->rel_pages;
297         new_rel_tuples = vacrelstats->new_rel_tuples;
298         if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
299         {
300                 new_rel_pages = vacrelstats->old_rel_pages;
301                 new_rel_tuples = vacrelstats->old_rel_tuples;
302         }
303
304         new_rel_allvisible = visibilitymap_count(onerel);
305         if (new_rel_allvisible > new_rel_pages)
306                 new_rel_allvisible = new_rel_pages;
307
308         new_frozen_xid = scanned_all ? FreezeLimit : InvalidTransactionId;
309         new_min_multi = scanned_all ? MultiXactCutoff : InvalidMultiXactId;
310
311         vac_update_relstats(onerel,
312                                                 new_rel_pages,
313                                                 new_rel_tuples,
314                                                 new_rel_allvisible,
315                                                 vacrelstats->hasindex,
316                                                 new_frozen_xid,
317                                                 new_min_multi,
318                                                 false);
319
320         /* report results to the stats collector, too */
321         new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples;
322         if (new_live_tuples < 0)
323                 new_live_tuples = 0;    /* just in case */
324
325         pgstat_report_vacuum(RelationGetRelid(onerel),
326                                                  onerel->rd_rel->relisshared,
327                                                  new_live_tuples,
328                                                  vacrelstats->new_dead_tuples);
329
330         /* and log the action if appropriate */
331         if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
332         {
333                 TimestampTz endtime = GetCurrentTimestamp();
334
335                 if (params->log_min_duration == 0 ||
336                         TimestampDifferenceExceeds(starttime, endtime,
337                                                                            params->log_min_duration))
338                 {
339                         StringInfoData buf;
340
341                         TimestampDifference(starttime, endtime, &secs, &usecs);
342
343                         read_rate = 0;
344                         write_rate = 0;
345                         if ((secs > 0) || (usecs > 0))
346                         {
347                                 read_rate = (double) BLCKSZ *VacuumPageMiss / (1024 * 1024) /
348                                                         (secs + usecs / 1000000.0);
349                                 write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) /
350                                                         (secs + usecs / 1000000.0);
351                         }
352
353                         /*
354                          * This is pretty messy, but we split it up so that we can skip
355                          * emitting individual parts of the message when not applicable.
356                          */
357                         initStringInfo(&buf);
358                         appendStringInfo(&buf, _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"),
359                                                          get_database_name(MyDatabaseId),
360                                                          get_namespace_name(RelationGetNamespace(onerel)),
361                                                          RelationGetRelationName(onerel),
362                                                          vacrelstats->num_index_scans);
363                         appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins\n"),
364                                                          vacrelstats->pages_removed,
365                                                          vacrelstats->rel_pages,
366                                                          vacrelstats->pinskipped_pages);
367                         appendStringInfo(&buf,
368                                                          _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable\n"),
369                                                          vacrelstats->tuples_deleted,
370                                                          vacrelstats->new_rel_tuples,
371                                                          vacrelstats->new_dead_tuples);
372                         appendStringInfo(&buf,
373                                                  _("buffer usage: %d hits, %d misses, %d dirtied\n"),
374                                                          VacuumPageHit,
375                                                          VacuumPageMiss,
376                                                          VacuumPageDirty);
377                         appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
378                                                          read_rate, write_rate);
379                         appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
380
381                         ereport(LOG,
382                                         (errmsg_internal("%s", buf.data)));
383                         pfree(buf.data);
384                 }
385         }
386 }
387
388 /*
389  * For Hot Standby we need to know the highest transaction id that will
390  * be removed by any change. VACUUM proceeds in a number of passes so
391  * we need to consider how each pass operates. The first phase runs
392  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
393  * progresses - these will have a latestRemovedXid on each record.
394  * In some cases this removes all of the tuples to be removed, though
395  * often we have dead tuples with index pointers so we must remember them
396  * for removal in phase 3. Index records for those rows are removed
397  * in phase 2 and index blocks do not have MVCC information attached.
398  * So before we can allow removal of any index tuples we need to issue
399  * a WAL record containing the latestRemovedXid of rows that will be
400  * removed in phase three. This allows recovery queries to block at the
401  * correct place, i.e. before phase two, rather than during phase three
402  * which would be after the rows have become inaccessible.
403  */
404 static void
405 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
406 {
407         /*
408          * Skip this for relations for which no WAL is to be written, or if we're
409          * not trying to support archive recovery.
410          */
411         if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
412                 return;
413
414         /*
415          * No need to write the record at all unless it contains a valid value
416          */
417         if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
418                 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
419 }
420
421 /*
422  *      lazy_scan_heap() -- scan an open heap relation
423  *
424  *              This routine prunes each page in the heap, which will among other
425  *              things truncate dead tuples to dead line pointers, defragment the
426  *              page, and set commit status bits (see heap_page_prune).  It also builds
427  *              lists of dead tuples and pages with free space, calculates statistics
428  *              on the number of live tuples in the heap, and marks pages as
429  *              all-visible if appropriate.  When done, or when we run low on space for
430  *              dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
431  *              to reclaim dead line pointers.
432  *
433  *              If there are no indexes then we can reclaim line pointers on the fly;
434  *              dead line pointers need only be retained until all index pointers that
435  *              reference them have been killed.
436  */
437 static void
438 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
439                            Relation *Irel, int nindexes, bool scan_all)
440 {
441         BlockNumber nblocks,
442                                 blkno;
443         HeapTupleData tuple;
444         char       *relname;
445         BlockNumber empty_pages,
446                                 vacuumed_pages;
447         double          num_tuples,
448                                 tups_vacuumed,
449                                 nkeep,
450                                 nunused;
451         IndexBulkDeleteResult **indstats;
452         int                     i;
453         PGRUsage        ru0;
454         Buffer          vmbuffer = InvalidBuffer;
455         BlockNumber next_not_all_visible_block;
456         bool            skipping_all_visible_blocks;
457         xl_heap_freeze_tuple *frozen;
458         StringInfoData buf;
459
460         pg_rusage_init(&ru0);
461
462         relname = RelationGetRelationName(onerel);
463         ereport(elevel,
464                         (errmsg("vacuuming \"%s.%s\"",
465                                         get_namespace_name(RelationGetNamespace(onerel)),
466                                         relname)));
467
468         empty_pages = vacuumed_pages = 0;
469         num_tuples = tups_vacuumed = nkeep = nunused = 0;
470
471         indstats = (IndexBulkDeleteResult **)
472                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
473
474         nblocks = RelationGetNumberOfBlocks(onerel);
475         vacrelstats->rel_pages = nblocks;
476         vacrelstats->scanned_pages = 0;
477         vacrelstats->nonempty_pages = 0;
478         vacrelstats->latestRemovedXid = InvalidTransactionId;
479
480         lazy_space_alloc(vacrelstats, nblocks);
481         frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
482
483         /*
484          * We want to skip pages that don't require vacuuming according to the
485          * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
486          * consecutive pages.  Since we're reading sequentially, the OS should be
487          * doing readahead for us, so there's no gain in skipping a page now and
488          * then; that's likely to disable readahead and so be counterproductive.
489          * Also, skipping even a single page means that we can't update
490          * relfrozenxid, so we only want to do it if we can skip a goodly number
491          * of pages.
492          *
493          * Before entering the main loop, establish the invariant that
494          * next_not_all_visible_block is the next block number >= blkno that's not
495          * all-visible according to the visibility map, or nblocks if there's no
496          * such block.  Also, we set up the skipping_all_visible_blocks flag,
497          * which is needed because we need hysteresis in the decision: once we've
498          * started skipping blocks, we may as well skip everything up to the next
499          * not-all-visible block.
500          *
501          * Note: if scan_all is true, we won't actually skip any pages; but we
502          * maintain next_not_all_visible_block anyway, so as to set up the
503          * all_visible_according_to_vm flag correctly for each page.
504          *
505          * Note: The value returned by visibilitymap_test could be slightly
506          * out-of-date, since we make this test before reading the corresponding
507          * heap page or locking the buffer.  This is OK.  If we mistakenly think
508          * that the page is all-visible when in fact the flag's just been cleared,
509          * we might fail to vacuum the page.  But it's OK to skip pages when
510          * scan_all is not set, so no great harm done; the next vacuum will find
511          * them.  If we make the reverse mistake and vacuum a page unnecessarily,
512          * it'll just be a no-op.
513          */
514         for (next_not_all_visible_block = 0;
515                  next_not_all_visible_block < nblocks;
516                  next_not_all_visible_block++)
517         {
518                 if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
519                         break;
520                 vacuum_delay_point();
521         }
522         if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
523                 skipping_all_visible_blocks = true;
524         else
525                 skipping_all_visible_blocks = false;
526
527         for (blkno = 0; blkno < nblocks; blkno++)
528         {
529                 Buffer          buf;
530                 Page            page;
531                 OffsetNumber offnum,
532                                         maxoff;
533                 bool            tupgone,
534                                         hastup;
535                 int                     prev_dead_count;
536                 int                     nfrozen;
537                 Size            freespace;
538                 bool            all_visible_according_to_vm;
539                 bool            all_visible;
540                 bool            has_dead_tuples;
541                 TransactionId visibility_cutoff_xid = InvalidTransactionId;
542
543                 if (blkno == next_not_all_visible_block)
544                 {
545                         /* Time to advance next_not_all_visible_block */
546                         for (next_not_all_visible_block++;
547                                  next_not_all_visible_block < nblocks;
548                                  next_not_all_visible_block++)
549                         {
550                                 if (!visibilitymap_test(onerel, next_not_all_visible_block,
551                                                                                 &vmbuffer))
552                                         break;
553                                 vacuum_delay_point();
554                         }
555
556                         /*
557                          * We know we can't skip the current block.  But set up
558                          * skipping_all_visible_blocks to do the right thing at the
559                          * following blocks.
560                          */
561                         if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
562                                 skipping_all_visible_blocks = true;
563                         else
564                                 skipping_all_visible_blocks = false;
565                         all_visible_according_to_vm = false;
566                 }
567                 else
568                 {
569                         /* Current block is all-visible */
570                         if (skipping_all_visible_blocks && !scan_all)
571                                 continue;
572                         all_visible_according_to_vm = true;
573                 }
574
575                 vacuum_delay_point();
576
577                 /*
578                  * If we are close to overrunning the available space for dead-tuple
579                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
580                  */
581                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
582                         vacrelstats->num_dead_tuples > 0)
583                 {
584                         /*
585                          * Before beginning index vacuuming, we release any pin we may
586                          * hold on the visibility map page.  This isn't necessary for
587                          * correctness, but we do it anyway to avoid holding the pin
588                          * across a lengthy, unrelated operation.
589                          */
590                         if (BufferIsValid(vmbuffer))
591                         {
592                                 ReleaseBuffer(vmbuffer);
593                                 vmbuffer = InvalidBuffer;
594                         }
595
596                         /* Log cleanup info before we touch indexes */
597                         vacuum_log_cleanup_info(onerel, vacrelstats);
598
599                         /* Remove index entries */
600                         for (i = 0; i < nindexes; i++)
601                                 lazy_vacuum_index(Irel[i],
602                                                                   &indstats[i],
603                                                                   vacrelstats);
604                         /* Remove tuples from heap */
605                         lazy_vacuum_heap(onerel, vacrelstats);
606
607                         /*
608                          * Forget the now-vacuumed tuples, and press on, but be careful
609                          * not to reset latestRemovedXid since we want that value to be
610                          * valid.
611                          */
612                         vacrelstats->num_dead_tuples = 0;
613                         vacrelstats->num_index_scans++;
614                 }
615
616                 /*
617                  * Pin the visibility map page in case we need to mark the page
618                  * all-visible.  In most cases this will be very cheap, because we'll
619                  * already have the correct page pinned anyway.  However, it's
620                  * possible that (a) next_not_all_visible_block is covered by a
621                  * different VM page than the current block or (b) we released our pin
622                  * and did a cycle of index vacuuming.
623                  */
624                 visibilitymap_pin(onerel, blkno, &vmbuffer);
625
626                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
627                                                                  RBM_NORMAL, vac_strategy);
628
629                 /* We need buffer cleanup lock so that we can prune HOT chains. */
630                 if (!ConditionalLockBufferForCleanup(buf))
631                 {
632                         /*
633                          * If we're not scanning the whole relation to guard against XID
634                          * wraparound, it's OK to skip vacuuming a page.  The next vacuum
635                          * will clean it up.
636                          */
637                         if (!scan_all)
638                         {
639                                 ReleaseBuffer(buf);
640                                 vacrelstats->pinskipped_pages++;
641                                 continue;
642                         }
643
644                         /*
645                          * If this is a wraparound checking vacuum, then we read the page
646                          * with share lock to see if any xids need to be frozen. If the
647                          * page doesn't need attention we just skip and continue. If it
648                          * does, we wait for cleanup lock.
649                          *
650                          * We could defer the lock request further by remembering the page
651                          * and coming back to it later, or we could even register
652                          * ourselves for multiple buffers and then service whichever one
653                          * is received first.  For now, this seems good enough.
654                          */
655                         LockBuffer(buf, BUFFER_LOCK_SHARE);
656                         if (!lazy_check_needs_freeze(buf))
657                         {
658                                 UnlockReleaseBuffer(buf);
659                                 vacrelstats->scanned_pages++;
660                                 vacrelstats->pinskipped_pages++;
661                                 continue;
662                         }
663                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
664                         LockBufferForCleanup(buf);
665                         /* drop through to normal processing */
666                 }
667
668                 vacrelstats->scanned_pages++;
669
670                 page = BufferGetPage(buf);
671
672                 if (PageIsNew(page))
673                 {
674                         /*
675                          * An all-zeroes page could be left over if a backend extends the
676                          * relation but crashes before initializing the page. Reclaim such
677                          * pages for use.
678                          *
679                          * We have to be careful here because we could be looking at a
680                          * page that someone has just added to the relation and not yet
681                          * been able to initialize (see RelationGetBufferForTuple). To
682                          * protect against that, release the buffer lock, grab the
683                          * relation extension lock momentarily, and re-lock the buffer. If
684                          * the page is still uninitialized by then, it must be left over
685                          * from a crashed backend, and we can initialize it.
686                          *
687                          * We don't really need the relation lock when this is a new or
688                          * temp relation, but it's probably not worth the code space to
689                          * check that, since this surely isn't a critical path.
690                          *
691                          * Note: the comparable code in vacuum.c need not worry because
692                          * it's got exclusive lock on the whole relation.
693                          */
694                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
695                         LockRelationForExtension(onerel, ExclusiveLock);
696                         UnlockRelationForExtension(onerel, ExclusiveLock);
697                         LockBufferForCleanup(buf);
698                         if (PageIsNew(page))
699                         {
700                                 ereport(WARNING,
701                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
702                                                 relname, blkno)));
703                                 PageInit(page, BufferGetPageSize(buf), 0);
704                                 empty_pages++;
705                         }
706                         freespace = PageGetHeapFreeSpace(page);
707                         MarkBufferDirty(buf);
708                         UnlockReleaseBuffer(buf);
709
710                         RecordPageWithFreeSpace(onerel, blkno, freespace);
711                         continue;
712                 }
713
714                 if (PageIsEmpty(page))
715                 {
716                         empty_pages++;
717                         freespace = PageGetHeapFreeSpace(page);
718
719                         /* empty pages are always all-visible */
720                         if (!PageIsAllVisible(page))
721                         {
722                                 START_CRIT_SECTION();
723
724                                 /* mark buffer dirty before writing a WAL record */
725                                 MarkBufferDirty(buf);
726
727                                 /*
728                                  * It's possible that another backend has extended the heap,
729                                  * initialized the page, and then failed to WAL-log the page
730                                  * due to an ERROR.  Since heap extension is not WAL-logged,
731                                  * recovery might try to replay our record setting the page
732                                  * all-visible and find that the page isn't initialized, which
733                                  * will cause a PANIC.  To prevent that, check whether the
734                                  * page has been previously WAL-logged, and if not, do that
735                                  * now.
736                                  */
737                                 if (RelationNeedsWAL(onerel) &&
738                                         PageGetLSN(page) == InvalidXLogRecPtr)
739                                         log_newpage_buffer(buf, true);
740
741                                 PageSetAllVisible(page);
742                                 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
743                                                                   vmbuffer, InvalidTransactionId);
744                                 END_CRIT_SECTION();
745                         }
746
747                         UnlockReleaseBuffer(buf);
748                         RecordPageWithFreeSpace(onerel, blkno, freespace);
749                         continue;
750                 }
751
752                 /*
753                  * Prune all HOT-update chains in this page.
754                  *
755                  * We count tuples removed by the pruning step as removed by VACUUM.
756                  */
757                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
758                                                                                  &vacrelstats->latestRemovedXid);
759
760                 /*
761                  * Now scan the page to collect vacuumable items and check for tuples
762                  * requiring freezing.
763                  */
764                 all_visible = true;
765                 has_dead_tuples = false;
766                 nfrozen = 0;
767                 hastup = false;
768                 prev_dead_count = vacrelstats->num_dead_tuples;
769                 maxoff = PageGetMaxOffsetNumber(page);
770
771                 /*
772                  * Note: If you change anything in the loop below, also look at
773                  * heap_page_is_all_visible to see if that needs to be changed.
774                  */
775                 for (offnum = FirstOffsetNumber;
776                          offnum <= maxoff;
777                          offnum = OffsetNumberNext(offnum))
778                 {
779                         ItemId          itemid;
780
781                         itemid = PageGetItemId(page, offnum);
782
783                         /* Unused items require no processing, but we count 'em */
784                         if (!ItemIdIsUsed(itemid))
785                         {
786                                 nunused += 1;
787                                 continue;
788                         }
789
790                         /* Redirect items mustn't be touched */
791                         if (ItemIdIsRedirected(itemid))
792                         {
793                                 hastup = true;  /* this page won't be truncatable */
794                                 continue;
795                         }
796
797                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
798
799                         /*
800                          * DEAD item pointers are to be vacuumed normally; but we don't
801                          * count them in tups_vacuumed, else we'd be double-counting (at
802                          * least in the common case where heap_page_prune() just freed up
803                          * a non-HOT tuple).
804                          */
805                         if (ItemIdIsDead(itemid))
806                         {
807                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
808                                 all_visible = false;
809                                 continue;
810                         }
811
812                         Assert(ItemIdIsNormal(itemid));
813
814                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
815                         tuple.t_len = ItemIdGetLength(itemid);
816                         tuple.t_tableOid = RelationGetRelid(onerel);
817
818                         tupgone = false;
819
820                         switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
821                         {
822                                 case HEAPTUPLE_DEAD:
823
824                                         /*
825                                          * Ordinarily, DEAD tuples would have been removed by
826                                          * heap_page_prune(), but it's possible that the tuple
827                                          * state changed since heap_page_prune() looked.  In
828                                          * particular an INSERT_IN_PROGRESS tuple could have
829                                          * changed to DEAD if the inserter aborted.  So this
830                                          * cannot be considered an error condition.
831                                          *
832                                          * If the tuple is HOT-updated then it must only be
833                                          * removed by a prune operation; so we keep it just as if
834                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
835                                          * tuple, we choose to keep it, because it'll be a lot
836                                          * cheaper to get rid of it in the next pruning pass than
837                                          * to treat it like an indexed tuple.
838                                          */
839                                         if (HeapTupleIsHotUpdated(&tuple) ||
840                                                 HeapTupleIsHeapOnly(&tuple))
841                                                 nkeep += 1;
842                                         else
843                                                 tupgone = true; /* we can delete the tuple */
844                                         all_visible = false;
845                                         break;
846                                 case HEAPTUPLE_LIVE:
847                                         /* Tuple is good --- but let's do some validity checks */
848                                         if (onerel->rd_rel->relhasoids &&
849                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
850                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
851                                                          relname, blkno, offnum);
852
853                                         /*
854                                          * Is the tuple definitely visible to all transactions?
855                                          *
856                                          * NB: Like with per-tuple hint bits, we can't set the
857                                          * PD_ALL_VISIBLE flag if the inserter committed
858                                          * asynchronously. See SetHintBits for more info. Check
859                                          * that the tuple is hinted xmin-committed because of
860                                          * that.
861                                          */
862                                         if (all_visible)
863                                         {
864                                                 TransactionId xmin;
865
866                                                 if (!HeapTupleHeaderXminCommitted(tuple.t_data))
867                                                 {
868                                                         all_visible = false;
869                                                         break;
870                                                 }
871
872                                                 /*
873                                                  * The inserter definitely committed. But is it old
874                                                  * enough that everyone sees it as committed?
875                                                  */
876                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
877                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
878                                                 {
879                                                         all_visible = false;
880                                                         break;
881                                                 }
882
883                                                 /* Track newest xmin on page. */
884                                                 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
885                                                         visibility_cutoff_xid = xmin;
886                                         }
887                                         break;
888                                 case HEAPTUPLE_RECENTLY_DEAD:
889
890                                         /*
891                                          * If tuple is recently deleted then we must not remove it
892                                          * from relation.
893                                          */
894                                         nkeep += 1;
895                                         all_visible = false;
896                                         break;
897                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
898                                         /* This is an expected case during concurrent vacuum */
899                                         all_visible = false;
900                                         break;
901                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
902                                         /* This is an expected case during concurrent vacuum */
903                                         all_visible = false;
904                                         break;
905                                 default:
906                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
907                                         break;
908                         }
909
910                         if (tupgone)
911                         {
912                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
913                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
914                                                                                          &vacrelstats->latestRemovedXid);
915                                 tups_vacuumed += 1;
916                                 has_dead_tuples = true;
917                         }
918                         else
919                         {
920                                 num_tuples += 1;
921                                 hastup = true;
922
923                                 /*
924                                  * Each non-removable tuple must be checked to see if it needs
925                                  * freezing.  Note we already have exclusive buffer lock.
926                                  */
927                                 if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
928                                                                                   MultiXactCutoff, &frozen[nfrozen]))
929                                         frozen[nfrozen++].offset = offnum;
930                         }
931                 }                                               /* scan along page */
932
933                 /*
934                  * If we froze any tuples, mark the buffer dirty, and write a WAL
935                  * record recording the changes.  We must log the changes to be
936                  * crash-safe against future truncation of CLOG.
937                  */
938                 if (nfrozen > 0)
939                 {
940                         START_CRIT_SECTION();
941
942                         MarkBufferDirty(buf);
943
944                         /* execute collected freezes */
945                         for (i = 0; i < nfrozen; i++)
946                         {
947                                 ItemId          itemid;
948                                 HeapTupleHeader htup;
949
950                                 itemid = PageGetItemId(page, frozen[i].offset);
951                                 htup = (HeapTupleHeader) PageGetItem(page, itemid);
952
953                                 heap_execute_freeze_tuple(htup, &frozen[i]);
954                         }
955
956                         /* Now WAL-log freezing if necessary */
957                         if (RelationNeedsWAL(onerel))
958                         {
959                                 XLogRecPtr      recptr;
960
961                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
962                                                                                  frozen, nfrozen);
963                                 PageSetLSN(page, recptr);
964                         }
965
966                         END_CRIT_SECTION();
967                 }
968
969                 /*
970                  * If there are no indexes then we can vacuum the page right now
971                  * instead of doing a second scan.
972                  */
973                 if (nindexes == 0 &&
974                         vacrelstats->num_dead_tuples > 0)
975                 {
976                         /* Remove tuples from heap */
977                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
978                         has_dead_tuples = false;
979
980                         /*
981                          * Forget the now-vacuumed tuples, and press on, but be careful
982                          * not to reset latestRemovedXid since we want that value to be
983                          * valid.
984                          */
985                         vacrelstats->num_dead_tuples = 0;
986                         vacuumed_pages++;
987                 }
988
989                 freespace = PageGetHeapFreeSpace(page);
990
991                 /* mark page all-visible, if appropriate */
992                 if (all_visible && !all_visible_according_to_vm)
993                 {
994                         /*
995                          * It should never be the case that the visibility map page is set
996                          * while the page-level bit is clear, but the reverse is allowed
997                          * (if checksums are not enabled).  Regardless, set the both bits
998                          * so that we get back in sync.
999                          *
1000                          * NB: If the heap page is all-visible but the VM bit is not set,
1001                          * we don't need to dirty the heap page.  However, if checksums
1002                          * are enabled, we do need to make sure that the heap page is
1003                          * dirtied before passing it to visibilitymap_set(), because it
1004                          * may be logged.  Given that this situation should only happen in
1005                          * rare cases after a crash, it is not worth optimizing.
1006                          */
1007                         PageSetAllVisible(page);
1008                         MarkBufferDirty(buf);
1009                         visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
1010                                                           vmbuffer, visibility_cutoff_xid);
1011                 }
1012
1013                 /*
1014                  * As of PostgreSQL 9.2, the visibility map bit should never be set if
1015                  * the page-level bit is clear.  However, it's possible that the bit
1016                  * got cleared after we checked it and before we took the buffer
1017                  * content lock, so we must recheck before jumping to the conclusion
1018                  * that something bad has happened.
1019                  */
1020                 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
1021                                  && visibilitymap_test(onerel, blkno, &vmbuffer))
1022                 {
1023                         elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
1024                                  relname, blkno);
1025                         visibilitymap_clear(onerel, blkno, vmbuffer);
1026                 }
1027
1028                 /*
1029                  * It's possible for the value returned by GetOldestXmin() to move
1030                  * backwards, so it's not wrong for us to see tuples that appear to
1031                  * not be visible to everyone yet, while PD_ALL_VISIBLE is already
1032                  * set. The real safe xmin value never moves backwards, but
1033                  * GetOldestXmin() is conservative and sometimes returns a value
1034                  * that's unnecessarily small, so if we see that contradiction it just
1035                  * means that the tuples that we think are not visible to everyone yet
1036                  * actually are, and the PD_ALL_VISIBLE flag is correct.
1037                  *
1038                  * There should never be dead tuples on a page with PD_ALL_VISIBLE
1039                  * set, however.
1040                  */
1041                 else if (PageIsAllVisible(page) && has_dead_tuples)
1042                 {
1043                         elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
1044                                  relname, blkno);
1045                         PageClearAllVisible(page);
1046                         MarkBufferDirty(buf);
1047                         visibilitymap_clear(onerel, blkno, vmbuffer);
1048                 }
1049
1050                 UnlockReleaseBuffer(buf);
1051
1052                 /* Remember the location of the last page with nonremovable tuples */
1053                 if (hastup)
1054                         vacrelstats->nonempty_pages = blkno + 1;
1055
1056                 /*
1057                  * If we remembered any tuples for deletion, then the page will be
1058                  * visited again by lazy_vacuum_heap, which will compute and record
1059                  * its post-compaction free space.  If not, then we're done with this
1060                  * page, so remember its free space as-is.  (This path will always be
1061                  * taken if there are no indexes.)
1062                  */
1063                 if (vacrelstats->num_dead_tuples == prev_dead_count)
1064                         RecordPageWithFreeSpace(onerel, blkno, freespace);
1065         }
1066
1067         pfree(frozen);
1068
1069         /* save stats for use later */
1070         vacrelstats->scanned_tuples = num_tuples;
1071         vacrelstats->tuples_deleted = tups_vacuumed;
1072         vacrelstats->new_dead_tuples = nkeep;
1073
1074         /* now we can compute the new value for pg_class.reltuples */
1075         vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
1076                                                                                                                  nblocks,
1077                                                                                                   vacrelstats->scanned_pages,
1078                                                                                                                  num_tuples);
1079
1080         /*
1081          * Release any remaining pin on visibility map page.
1082          */
1083         if (BufferIsValid(vmbuffer))
1084         {
1085                 ReleaseBuffer(vmbuffer);
1086                 vmbuffer = InvalidBuffer;
1087         }
1088
1089         /* If any tuples need to be deleted, perform final vacuum cycle */
1090         /* XXX put a threshold on min number of tuples here? */
1091         if (vacrelstats->num_dead_tuples > 0)
1092         {
1093                 /* Log cleanup info before we touch indexes */
1094                 vacuum_log_cleanup_info(onerel, vacrelstats);
1095
1096                 /* Remove index entries */
1097                 for (i = 0; i < nindexes; i++)
1098                         lazy_vacuum_index(Irel[i],
1099                                                           &indstats[i],
1100                                                           vacrelstats);
1101                 /* Remove tuples from heap */
1102                 lazy_vacuum_heap(onerel, vacrelstats);
1103                 vacrelstats->num_index_scans++;
1104         }
1105
1106         /* Do post-vacuum cleanup and statistics update for each index */
1107         for (i = 0; i < nindexes; i++)
1108                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1109
1110         /* If no indexes, make log report that lazy_vacuum_heap would've made */
1111         if (vacuumed_pages)
1112                 ereport(elevel,
1113                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1114                                                 RelationGetRelationName(onerel),
1115                                                 tups_vacuumed, vacuumed_pages)));
1116
1117         /*
1118          * This is pretty messy, but we split it up so that we can skip emitting
1119          * individual parts of the message when not applicable.
1120          */
1121         initStringInfo(&buf);
1122         appendStringInfo(&buf,
1123                                          _("%.0f dead row versions cannot be removed yet.\n"),
1124                                          nkeep);
1125         appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
1126                                          nunused);
1127         appendStringInfo(&buf, _("Skipped %u pages due to buffer pins.\n"),
1128                                          vacrelstats->pinskipped_pages);
1129         appendStringInfo(&buf, _("%u pages are entirely empty.\n"),
1130                                          empty_pages);
1131         appendStringInfo(&buf, _("%s."),
1132                                          pg_rusage_show(&ru0));
1133
1134         ereport(elevel,
1135                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1136                                         RelationGetRelationName(onerel),
1137                                         tups_vacuumed, num_tuples,
1138                                         vacrelstats->scanned_pages, nblocks),
1139                          errdetail_internal("%s", buf.data)));
1140         pfree(buf.data);
1141 }
1142
1143
1144 /*
1145  *      lazy_vacuum_heap() -- second pass over the heap
1146  *
1147  *              This routine marks dead tuples as unused and compacts out free
1148  *              space on their pages.  Pages not having dead tuples recorded from
1149  *              lazy_scan_heap are not visited at all.
1150  *
1151  * Note: the reason for doing this as a second pass is we cannot remove
1152  * the tuples until we've removed their index entries, and we want to
1153  * process index entry removal in batches as large as possible.
1154  */
1155 static void
1156 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1157 {
1158         int                     tupindex;
1159         int                     npages;
1160         PGRUsage        ru0;
1161         Buffer          vmbuffer = InvalidBuffer;
1162
1163         pg_rusage_init(&ru0);
1164         npages = 0;
1165
1166         tupindex = 0;
1167         while (tupindex < vacrelstats->num_dead_tuples)
1168         {
1169                 BlockNumber tblk;
1170                 Buffer          buf;
1171                 Page            page;
1172                 Size            freespace;
1173
1174                 vacuum_delay_point();
1175
1176                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1177                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1178                                                                  vac_strategy);
1179                 if (!ConditionalLockBufferForCleanup(buf))
1180                 {
1181                         ReleaseBuffer(buf);
1182                         ++tupindex;
1183                         continue;
1184                 }
1185                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1186                                                                         &vmbuffer);
1187
1188                 /* Now that we've compacted the page, record its available space */
1189                 page = BufferGetPage(buf);
1190                 freespace = PageGetHeapFreeSpace(page);
1191
1192                 UnlockReleaseBuffer(buf);
1193                 RecordPageWithFreeSpace(onerel, tblk, freespace);
1194                 npages++;
1195         }
1196
1197         if (BufferIsValid(vmbuffer))
1198         {
1199                 ReleaseBuffer(vmbuffer);
1200                 vmbuffer = InvalidBuffer;
1201         }
1202
1203         ereport(elevel,
1204                         (errmsg("\"%s\": removed %d row versions in %d pages",
1205                                         RelationGetRelationName(onerel),
1206                                         tupindex, npages),
1207                          errdetail("%s.",
1208                                            pg_rusage_show(&ru0))));
1209 }
1210
1211 /*
1212  *      lazy_vacuum_page() -- free dead tuples on a page
1213  *                                       and repair its fragmentation.
1214  *
1215  * Caller must hold pin and buffer cleanup lock on the buffer.
1216  *
1217  * tupindex is the index in vacrelstats->dead_tuples of the first dead
1218  * tuple for this page.  We assume the rest follow sequentially.
1219  * The return value is the first tupindex after the tuples of this page.
1220  */
1221 static int
1222 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1223                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1224 {
1225         Page            page = BufferGetPage(buffer);
1226         OffsetNumber unused[MaxOffsetNumber];
1227         int                     uncnt = 0;
1228         TransactionId visibility_cutoff_xid;
1229
1230         START_CRIT_SECTION();
1231
1232         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1233         {
1234                 BlockNumber tblk;
1235                 OffsetNumber toff;
1236                 ItemId          itemid;
1237
1238                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1239                 if (tblk != blkno)
1240                         break;                          /* past end of tuples for this block */
1241                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1242                 itemid = PageGetItemId(page, toff);
1243                 ItemIdSetUnused(itemid);
1244                 unused[uncnt++] = toff;
1245         }
1246
1247         PageRepairFragmentation(page);
1248
1249         /*
1250          * Mark buffer dirty before we write WAL.
1251          */
1252         MarkBufferDirty(buffer);
1253
1254         /* XLOG stuff */
1255         if (RelationNeedsWAL(onerel))
1256         {
1257                 XLogRecPtr      recptr;
1258
1259                 recptr = log_heap_clean(onerel, buffer,
1260                                                                 NULL, 0, NULL, 0,
1261                                                                 unused, uncnt,
1262                                                                 vacrelstats->latestRemovedXid);
1263                 PageSetLSN(page, recptr);
1264         }
1265
1266         /*
1267          * End critical section, so we safely can do visibility tests (which
1268          * possibly need to perform IO and allocate memory!). If we crash now the
1269          * page (including the corresponding vm bit) might not be marked all
1270          * visible, but that's fine. A later vacuum will fix that.
1271          */
1272         END_CRIT_SECTION();
1273
1274         /*
1275          * Now that we have removed the dead tuples from the page, once again
1276          * check if the page has become all-visible.  The page is already marked
1277          * dirty, exclusively locked, and, if needed, a full page image has been
1278          * emitted in the log_heap_clean() above.
1279          */
1280         if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid))
1281                 PageSetAllVisible(page);
1282
1283         /*
1284          * All the changes to the heap page have been done. If the all-visible
1285          * flag is now set, also set the VM bit.
1286          */
1287         if (PageIsAllVisible(page) &&
1288                 !visibilitymap_test(onerel, blkno, vmbuffer))
1289         {
1290                 Assert(BufferIsValid(*vmbuffer));
1291                 visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
1292                                                   visibility_cutoff_xid);
1293         }
1294
1295         return tupindex;
1296 }
1297
1298 /*
1299  *      lazy_check_needs_freeze() -- scan page to see if any tuples
1300  *                                       need to be cleaned to avoid wraparound
1301  *
1302  * Returns true if the page needs to be vacuumed using cleanup lock.
1303  */
1304 static bool
1305 lazy_check_needs_freeze(Buffer buf)
1306 {
1307         Page            page;
1308         OffsetNumber offnum,
1309                                 maxoff;
1310         HeapTupleHeader tupleheader;
1311
1312         page = BufferGetPage(buf);
1313
1314         if (PageIsNew(page) || PageIsEmpty(page))
1315         {
1316                 /* PageIsNew probably shouldn't happen... */
1317                 return false;
1318         }
1319
1320         maxoff = PageGetMaxOffsetNumber(page);
1321         for (offnum = FirstOffsetNumber;
1322                  offnum <= maxoff;
1323                  offnum = OffsetNumberNext(offnum))
1324         {
1325                 ItemId          itemid;
1326
1327                 itemid = PageGetItemId(page, offnum);
1328
1329                 if (!ItemIdIsNormal(itemid))
1330                         continue;
1331
1332                 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1333
1334                 if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1335                                                                         MultiXactCutoff, buf))
1336                         return true;
1337         }                                                       /* scan along page */
1338
1339         return false;
1340 }
1341
1342
1343 /*
1344  *      lazy_vacuum_index() -- vacuum one index relation.
1345  *
1346  *              Delete all the index entries pointing to tuples listed in
1347  *              vacrelstats->dead_tuples, and update running statistics.
1348  */
1349 static void
1350 lazy_vacuum_index(Relation indrel,
1351                                   IndexBulkDeleteResult **stats,
1352                                   LVRelStats *vacrelstats)
1353 {
1354         IndexVacuumInfo ivinfo;
1355         PGRUsage        ru0;
1356
1357         pg_rusage_init(&ru0);
1358
1359         ivinfo.index = indrel;
1360         ivinfo.analyze_only = false;
1361         ivinfo.estimated_count = true;
1362         ivinfo.message_level = elevel;
1363         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
1364         ivinfo.strategy = vac_strategy;
1365
1366         /* Do bulk deletion */
1367         *stats = index_bulk_delete(&ivinfo, *stats,
1368                                                            lazy_tid_reaped, (void *) vacrelstats);
1369
1370         ereport(elevel,
1371                         (errmsg("scanned index \"%s\" to remove %d row versions",
1372                                         RelationGetRelationName(indrel),
1373                                         vacrelstats->num_dead_tuples),
1374                          errdetail("%s.", pg_rusage_show(&ru0))));
1375 }
1376
1377 /*
1378  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1379  */
1380 static void
1381 lazy_cleanup_index(Relation indrel,
1382                                    IndexBulkDeleteResult *stats,
1383                                    LVRelStats *vacrelstats)
1384 {
1385         IndexVacuumInfo ivinfo;
1386         PGRUsage        ru0;
1387
1388         pg_rusage_init(&ru0);
1389
1390         ivinfo.index = indrel;
1391         ivinfo.analyze_only = false;
1392         ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
1393         ivinfo.message_level = elevel;
1394         ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1395         ivinfo.strategy = vac_strategy;
1396
1397         stats = index_vacuum_cleanup(&ivinfo, stats);
1398
1399         if (!stats)
1400                 return;
1401
1402         /*
1403          * Now update statistics in pg_class, but only if the index says the count
1404          * is accurate.
1405          */
1406         if (!stats->estimated_count)
1407                 vac_update_relstats(indrel,
1408                                                         stats->num_pages,
1409                                                         stats->num_index_tuples,
1410                                                         0,
1411                                                         false,
1412                                                         InvalidTransactionId,
1413                                                         InvalidMultiXactId,
1414                                                         false);
1415
1416         ereport(elevel,
1417                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1418                                         RelationGetRelationName(indrel),
1419                                         stats->num_index_tuples,
1420                                         stats->num_pages),
1421                          errdetail("%.0f index row versions were removed.\n"
1422                          "%u index pages have been deleted, %u are currently reusable.\n"
1423                                            "%s.",
1424                                            stats->tuples_removed,
1425                                            stats->pages_deleted, stats->pages_free,
1426                                            pg_rusage_show(&ru0))));
1427
1428         pfree(stats);
1429 }
1430
1431 /*
1432  * lazy_truncate_heap - try to truncate off any empty pages at the end
1433  */
1434 static void
1435 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1436 {
1437         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1438         BlockNumber new_rel_pages;
1439         PGRUsage        ru0;
1440         int                     lock_retry;
1441
1442         pg_rusage_init(&ru0);
1443
1444         /*
1445          * Loop until no more truncating can be done.
1446          */
1447         do
1448         {
1449                 /*
1450                  * We need full exclusive lock on the relation in order to do
1451                  * truncation. If we can't get it, give up rather than waiting --- we
1452                  * don't want to block other backends, and we don't want to deadlock
1453                  * (which is quite possible considering we already hold a lower-grade
1454                  * lock).
1455                  */
1456                 vacrelstats->lock_waiter_detected = false;
1457                 lock_retry = 0;
1458                 while (true)
1459                 {
1460                         if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1461                                 break;
1462
1463                         /*
1464                          * Check for interrupts while trying to (re-)acquire the exclusive
1465                          * lock.
1466                          */
1467                         CHECK_FOR_INTERRUPTS();
1468
1469                         if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1470                                                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1471                         {
1472                                 /*
1473                                  * We failed to establish the lock in the specified number of
1474                                  * retries. This means we give up truncating.
1475                                  */
1476                                 vacrelstats->lock_waiter_detected = true;
1477                                 ereport(elevel,
1478                                                 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1479                                                                 RelationGetRelationName(onerel))));
1480                                 return;
1481                         }
1482
1483                         pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL);
1484                 }
1485
1486                 /*
1487                  * Now that we have exclusive lock, look to see if the rel has grown
1488                  * whilst we were vacuuming with non-exclusive lock.  If so, give up;
1489                  * the newly added pages presumably contain non-deletable tuples.
1490                  */
1491                 new_rel_pages = RelationGetNumberOfBlocks(onerel);
1492                 if (new_rel_pages != old_rel_pages)
1493                 {
1494                         /*
1495                          * Note: we intentionally don't update vacrelstats->rel_pages with
1496                          * the new rel size here.  If we did, it would amount to assuming
1497                          * that the new pages are empty, which is unlikely. Leaving the
1498                          * numbers alone amounts to assuming that the new pages have the
1499                          * same tuple density as existing ones, which is less unlikely.
1500                          */
1501                         UnlockRelation(onerel, AccessExclusiveLock);
1502                         return;
1503                 }
1504
1505                 /*
1506                  * Scan backwards from the end to verify that the end pages actually
1507                  * contain no tuples.  This is *necessary*, not optional, because
1508                  * other backends could have added tuples to these pages whilst we
1509                  * were vacuuming.
1510                  */
1511                 new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1512
1513                 if (new_rel_pages >= old_rel_pages)
1514                 {
1515                         /* can't do anything after all */
1516                         UnlockRelation(onerel, AccessExclusiveLock);
1517                         return;
1518                 }
1519
1520                 /*
1521                  * Okay to truncate.
1522                  */
1523                 RelationTruncate(onerel, new_rel_pages);
1524
1525                 /*
1526                  * We can release the exclusive lock as soon as we have truncated.
1527                  * Other backends can't safely access the relation until they have
1528                  * processed the smgr invalidation that smgrtruncate sent out ... but
1529                  * that should happen as part of standard invalidation processing once
1530                  * they acquire lock on the relation.
1531                  */
1532                 UnlockRelation(onerel, AccessExclusiveLock);
1533
1534                 /*
1535                  * Update statistics.  Here, it *is* correct to adjust rel_pages
1536                  * without also touching reltuples, since the tuple count wasn't
1537                  * changed by the truncation.
1538                  */
1539                 vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1540                 vacrelstats->rel_pages = new_rel_pages;
1541
1542                 ereport(elevel,
1543                                 (errmsg("\"%s\": truncated %u to %u pages",
1544                                                 RelationGetRelationName(onerel),
1545                                                 old_rel_pages, new_rel_pages),
1546                                  errdetail("%s.",
1547                                                    pg_rusage_show(&ru0))));
1548                 old_rel_pages = new_rel_pages;
1549         } while (new_rel_pages > vacrelstats->nonempty_pages &&
1550                          vacrelstats->lock_waiter_detected);
1551 }
1552
1553 /*
1554  * Rescan end pages to verify that they are (still) empty of tuples.
1555  *
1556  * Returns number of nondeletable pages (last nonempty page + 1).
1557  */
1558 static BlockNumber
1559 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1560 {
1561         BlockNumber blkno;
1562         instr_time      starttime;
1563
1564         /* Initialize the starttime if we check for conflicting lock requests */
1565         INSTR_TIME_SET_CURRENT(starttime);
1566
1567         /* Strange coding of loop control is needed because blkno is unsigned */
1568         blkno = vacrelstats->rel_pages;
1569         while (blkno > vacrelstats->nonempty_pages)
1570         {
1571                 Buffer          buf;
1572                 Page            page;
1573                 OffsetNumber offnum,
1574                                         maxoff;
1575                 bool            hastup;
1576
1577                 /*
1578                  * Check if another process requests a lock on our relation. We are
1579                  * holding an AccessExclusiveLock here, so they will be waiting. We
1580                  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1581                  * only check if that interval has elapsed once every 32 blocks to
1582                  * keep the number of system calls and actual shared lock table
1583                  * lookups to a minimum.
1584                  */
1585                 if ((blkno % 32) == 0)
1586                 {
1587                         instr_time      currenttime;
1588                         instr_time      elapsed;
1589
1590                         INSTR_TIME_SET_CURRENT(currenttime);
1591                         elapsed = currenttime;
1592                         INSTR_TIME_SUBTRACT(elapsed, starttime);
1593                         if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1594                                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1595                         {
1596                                 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1597                                 {
1598                                         ereport(elevel,
1599                                                         (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1600                                                                         RelationGetRelationName(onerel))));
1601
1602                                         vacrelstats->lock_waiter_detected = true;
1603                                         return blkno;
1604                                 }
1605                                 starttime = currenttime;
1606                         }
1607                 }
1608
1609                 /*
1610                  * We don't insert a vacuum delay point here, because we have an
1611                  * exclusive lock on the table which we want to hold for as short a
1612                  * time as possible.  We still need to check for interrupts however.
1613                  */
1614                 CHECK_FOR_INTERRUPTS();
1615
1616                 blkno--;
1617
1618                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1619                                                                  RBM_NORMAL, vac_strategy);
1620
1621                 /* In this phase we only need shared access to the buffer */
1622                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1623
1624                 page = BufferGetPage(buf);
1625
1626                 if (PageIsNew(page) || PageIsEmpty(page))
1627                 {
1628                         /* PageIsNew probably shouldn't happen... */
1629                         UnlockReleaseBuffer(buf);
1630                         continue;
1631                 }
1632
1633                 hastup = false;
1634                 maxoff = PageGetMaxOffsetNumber(page);
1635                 for (offnum = FirstOffsetNumber;
1636                          offnum <= maxoff;
1637                          offnum = OffsetNumberNext(offnum))
1638                 {
1639                         ItemId          itemid;
1640
1641                         itemid = PageGetItemId(page, offnum);
1642
1643                         /*
1644                          * Note: any non-unused item should be taken as a reason to keep
1645                          * this page.  We formerly thought that DEAD tuples could be
1646                          * thrown away, but that's not so, because we'd not have cleaned
1647                          * out their index entries.
1648                          */
1649                         if (ItemIdIsUsed(itemid))
1650                         {
1651                                 hastup = true;
1652                                 break;                  /* can stop scanning */
1653                         }
1654                 }                                               /* scan along page */
1655
1656                 UnlockReleaseBuffer(buf);
1657
1658                 /* Done scanning if we found a tuple here */
1659                 if (hastup)
1660                         return blkno + 1;
1661         }
1662
1663         /*
1664          * If we fall out of the loop, all the previously-thought-to-be-empty
1665          * pages still are; we need not bother to look at the last known-nonempty
1666          * page.
1667          */
1668         return vacrelstats->nonempty_pages;
1669 }
1670
1671 /*
1672  * lazy_space_alloc - space allocation decisions for lazy vacuum
1673  *
1674  * See the comments at the head of this file for rationale.
1675  */
1676 static void
1677 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1678 {
1679         long            maxtuples;
1680         int                     vac_work_mem = IsAutoVacuumWorkerProcess() &&
1681         autovacuum_work_mem != -1 ?
1682         autovacuum_work_mem : maintenance_work_mem;
1683
1684         if (vacrelstats->hasindex)
1685         {
1686                 maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
1687                 maxtuples = Min(maxtuples, INT_MAX);
1688                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1689
1690                 /* curious coding here to ensure the multiplication can't overflow */
1691                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1692                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1693
1694                 /* stay sane if small maintenance_work_mem */
1695                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1696         }
1697         else
1698         {
1699                 maxtuples = MaxHeapTuplesPerPage;
1700         }
1701
1702         vacrelstats->num_dead_tuples = 0;
1703         vacrelstats->max_dead_tuples = (int) maxtuples;
1704         vacrelstats->dead_tuples = (ItemPointer)
1705                 palloc(maxtuples * sizeof(ItemPointerData));
1706 }
1707
1708 /*
1709  * lazy_record_dead_tuple - remember one deletable tuple
1710  */
1711 static void
1712 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1713                                            ItemPointer itemptr)
1714 {
1715         /*
1716          * The array shouldn't overflow under normal behavior, but perhaps it
1717          * could if we are given a really small maintenance_work_mem. In that
1718          * case, just forget the last few tuples (we'll get 'em next time).
1719          */
1720         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1721         {
1722                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1723                 vacrelstats->num_dead_tuples++;
1724         }
1725 }
1726
1727 /*
1728  *      lazy_tid_reaped() -- is a particular tid deletable?
1729  *
1730  *              This has the right signature to be an IndexBulkDeleteCallback.
1731  *
1732  *              Assumes dead_tuples array is in sorted order.
1733  */
1734 static bool
1735 lazy_tid_reaped(ItemPointer itemptr, void *state)
1736 {
1737         LVRelStats *vacrelstats = (LVRelStats *) state;
1738         ItemPointer res;
1739
1740         res = (ItemPointer) bsearch((void *) itemptr,
1741                                                                 (void *) vacrelstats->dead_tuples,
1742                                                                 vacrelstats->num_dead_tuples,
1743                                                                 sizeof(ItemPointerData),
1744                                                                 vac_cmp_itemptr);
1745
1746         return (res != NULL);
1747 }
1748
1749 /*
1750  * Comparator routines for use with qsort() and bsearch().
1751  */
1752 static int
1753 vac_cmp_itemptr(const void *left, const void *right)
1754 {
1755         BlockNumber lblk,
1756                                 rblk;
1757         OffsetNumber loff,
1758                                 roff;
1759
1760         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1761         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1762
1763         if (lblk < rblk)
1764                 return -1;
1765         if (lblk > rblk)
1766                 return 1;
1767
1768         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1769         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1770
1771         if (loff < roff)
1772                 return -1;
1773         if (loff > roff)
1774                 return 1;
1775
1776         return 0;
1777 }
1778
1779 /*
1780  * Check if every tuple in the given page is visible to all current and future
1781  * transactions. Also return the visibility_cutoff_xid which is the highest
1782  * xmin amongst the visible tuples.
1783  */
1784 static bool
1785 heap_page_is_all_visible(Relation rel, Buffer buf, TransactionId *visibility_cutoff_xid)
1786 {
1787         Page            page = BufferGetPage(buf);
1788         BlockNumber blockno = BufferGetBlockNumber(buf);
1789         OffsetNumber offnum,
1790                                 maxoff;
1791         bool            all_visible = true;
1792
1793         *visibility_cutoff_xid = InvalidTransactionId;
1794
1795         /*
1796          * This is a stripped down version of the line pointer scan in
1797          * lazy_scan_heap(). So if you change anything here, also check that code.
1798          */
1799         maxoff = PageGetMaxOffsetNumber(page);
1800         for (offnum = FirstOffsetNumber;
1801                  offnum <= maxoff && all_visible;
1802                  offnum = OffsetNumberNext(offnum))
1803         {
1804                 ItemId          itemid;
1805                 HeapTupleData tuple;
1806
1807                 itemid = PageGetItemId(page, offnum);
1808
1809                 /* Unused or redirect line pointers are of no interest */
1810                 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
1811                         continue;
1812
1813                 ItemPointerSet(&(tuple.t_self), blockno, offnum);
1814
1815                 /*
1816                  * Dead line pointers can have index pointers pointing to them. So
1817                  * they can't be treated as visible
1818                  */
1819                 if (ItemIdIsDead(itemid))
1820                 {
1821                         all_visible = false;
1822                         break;
1823                 }
1824
1825                 Assert(ItemIdIsNormal(itemid));
1826
1827                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1828                 tuple.t_len = ItemIdGetLength(itemid);
1829                 tuple.t_tableOid = RelationGetRelid(rel);
1830
1831                 switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
1832                 {
1833                         case HEAPTUPLE_LIVE:
1834                                 {
1835                                         TransactionId xmin;
1836
1837                                         /* Check comments in lazy_scan_heap. */
1838                                         if (!HeapTupleHeaderXminCommitted(tuple.t_data))
1839                                         {
1840                                                 all_visible = false;
1841                                                 break;
1842                                         }
1843
1844                                         /*
1845                                          * The inserter definitely committed. But is it old enough
1846                                          * that everyone sees it as committed?
1847                                          */
1848                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1849                                         if (!TransactionIdPrecedes(xmin, OldestXmin))
1850                                         {
1851                                                 all_visible = false;
1852                                                 break;
1853                                         }
1854
1855                                         /* Track newest xmin on page. */
1856                                         if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
1857                                                 *visibility_cutoff_xid = xmin;
1858                                 }
1859                                 break;
1860
1861                         case HEAPTUPLE_DEAD:
1862                         case HEAPTUPLE_RECENTLY_DEAD:
1863                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1864                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1865                                 all_visible = false;
1866                                 break;
1867
1868                         default:
1869                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1870                                 break;
1871                 }
1872         }                                                       /* scan along page */
1873
1874         return all_visible;
1875 }