1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem memory space to keep
14  * track of dead tuples.  We initially allocate an array of TIDs of that size,
15  * with an upper limit that depends on table size (this limit ensures we don't
16  * allocate a huge area uselessly for vacuuming small tables).  If the array
17  * threatens to overflow, we suspend the heap scan phase and perform a pass of
18  * index cleanup and page compaction, then resume the heap scan with an empty
19  * TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.134 2010/04/21 19:53:24 sriggs Exp $
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/transam.h"
43 #include "access/visibilitymap.h"
44 #include "catalog/storage.h"
45 #include "commands/dbcommands.h"
46 #include "commands/vacuum.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/autovacuum.h"
50 #include "storage/bufmgr.h"
51 #include "storage/freespace.h"
52 #include "storage/lmgr.h"
53 #include "utils/lsyscache.h"
54 #include "utils/memutils.h"
55 #include "utils/pg_rusage.h"
56 #include "utils/tqual.h"
57
58
59 /*
60  * Space/time tradeoff parameters: do these need to be user-tunable?
61  *
62  * To consider truncating the relation, we want there to be at least
63  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
64  * is less) potentially-freeable pages.
65  */
66 #define REL_TRUNCATE_MINIMUM    1000
67 #define REL_TRUNCATE_FRACTION   16
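/*
 * Worked example (editorial note, not in the upstream source): for a
 * 64000-page relation the effective threshold is Min(REL_TRUNCATE_MINIMUM,
 * 64000 / REL_TRUNCATE_FRACTION) = Min(1000, 4000) = 1000 freeable pages,
 * while for an 800-page relation it is Min(1000, 50) = 50 pages; see the
 * test in lazy_vacuum_rel() before it calls lazy_truncate_heap().
 */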
68
69 /*
70  * Guesstimation of number of dead tuples per page.  This is used to
71  * provide an upper limit to memory allocated when vacuuming small
72  * tables.
73  */
74 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
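/*
 * Editorial sketch of the resulting cap (see lazy_space_alloc() below):
 * the dead-tuple array is sized to roughly
 *     Min(maintenance_work_mem * 1024 / sizeof(ItemPointerData),
 *         relblocks * LAZY_ALLOC_TUPLES)
 * entries, and never fewer than MaxHeapTuplesPerPage.
 */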
75
76 /*
77  * Before we consider skipping a page that's marked as clean in
78  * visibility map, we must've seen at least this many clean pages.
79  */
80 #define SKIP_PAGES_THRESHOLD    32
81
82 typedef struct LVRelStats
83 {
84         /* hasindex = true means two-pass strategy; false means one-pass */
85         bool            hasindex;
86         bool            scanned_all;    /* have we scanned all pages (this far)? */
87         /* Overall statistics about rel */
88         BlockNumber rel_pages;
89         double          old_rel_tuples; /* previous value of pg_class.reltuples */
90         double          rel_tuples;             /* counts only tuples on scanned pages */
91         BlockNumber pages_removed;
92         double          tuples_deleted;
93         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
94         /* List of TIDs of tuples we intend to delete */
95         /* NB: this list is ordered by TID address */
96         int                     num_dead_tuples;        /* current # of entries */
97         int                     max_dead_tuples;        /* # slots allocated in array */
98         ItemPointer dead_tuples;        /* array of ItemPointerData */
99         int                     num_index_scans;
100         TransactionId latestRemovedXid;
101 } LVRelStats;
102
103
104 /* A few variables that don't seem worth passing around as parameters */
105 static int      elevel = -1;
106
107 static TransactionId OldestXmin;
108 static TransactionId FreezeLimit;
109
110 static BufferAccessStrategy vac_strategy;
111
112
113 /* non-export function prototypes */
114 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
115                            Relation *Irel, int nindexes, bool scan_all);
116 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
117 static void lazy_vacuum_index(Relation indrel,
118                                   IndexBulkDeleteResult **stats,
119                                   LVRelStats *vacrelstats);
120 static void lazy_cleanup_index(Relation indrel,
121                                    IndexBulkDeleteResult *stats,
122                                    LVRelStats *vacrelstats);
123 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
124                                  int tupindex, LVRelStats *vacrelstats);
125 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
126 static BlockNumber count_nondeletable_pages(Relation onerel,
127                                                  LVRelStats *vacrelstats);
128 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
129 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
130                                            ItemPointer itemptr);
131 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
132 static int      vac_cmp_itemptr(const void *left, const void *right);
133
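/*
 * Editorial overview, inferred from the functions below: lazy_scan_heap()
 * performs the first heap pass (pruning pages and collecting dead-tuple
 * TIDs); lazy_vacuum_index() then deletes the matching index entries, once
 * per index; lazy_vacuum_heap() makes the second heap pass that marks the
 * collected TIDs unused.  The index/heap cleanup cycle repeats whenever the
 * dead-tuple array fills up during the scan.
 */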
134
135 /*
136  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
137  *
138  *              This routine vacuums a single heap, cleans out its indexes, and
139  *              updates its relpages and reltuples statistics.
140  *
141  *              At entry, we have already established a transaction and opened
142  *              and locked the relation.
143  */
144 void
145 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
146                                 BufferAccessStrategy bstrategy, bool *scanned_all)
147 {
148         LVRelStats *vacrelstats;
149         Relation   *Irel;
150         int                     nindexes;
151         BlockNumber possibly_freeable;
152         PGRUsage        ru0;
153         TimestampTz starttime = 0;
154         bool            scan_all;
155         TransactionId freezeTableLimit;
156
157         pg_rusage_init(&ru0);
158
159         /* measure elapsed time iff autovacuum logging requires it */
160         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
161                 starttime = GetCurrentTimestamp();
162
163         if (vacstmt->options & VACOPT_VERBOSE)
164                 elevel = INFO;
165         else
166                 elevel = DEBUG2;
167
168         vac_strategy = bstrategy;
169
170         vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
171                                                   onerel->rd_rel->relisshared,
172                                                   &OldestXmin, &FreezeLimit, &freezeTableLimit);
173         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
174                                                                                          freezeTableLimit);
175
176         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
177
178         vacrelstats->scanned_all = true;        /* will be cleared if we skip a page */
179         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
180         vacrelstats->num_index_scans = 0;
181
182         /* Open all indexes of the relation */
183         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
184         vacrelstats->hasindex = (nindexes > 0);
185
186         /* Do the vacuuming */
187         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
188
189         /* Done with indexes */
190         vac_close_indexes(nindexes, Irel, NoLock);
191
192         /*
193          * Optionally truncate the relation.
194          *
195          * Don't even think about it unless we have a shot at releasing a goodly
196          * number of pages.  Otherwise, the time taken isn't worth it.
197          */
198         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
199         if (possibly_freeable > 0 &&
200                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
201                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
202                 lazy_truncate_heap(onerel, vacrelstats);
203
204         /* Vacuum the Free Space Map */
205         FreeSpaceMapVacuum(onerel);
206
207         /*
208          * Update statistics in pg_class.  But only if we didn't skip any pages;
209          * the tuple count only includes tuples from the pages we've visited, and
210          * we haven't frozen tuples in unvisited pages either.  The page count is
211          * accurate in any case, but because we use the reltuples / relpages ratio
212          * in the planner, it's better to not update relpages either if we can't
213          * update reltuples.
214          */
215         if (vacrelstats->scanned_all)
216                 vac_update_relstats(onerel,
217                                                         vacrelstats->rel_pages, vacrelstats->rel_tuples,
218                                                         vacrelstats->hasindex,
219                                                         FreezeLimit);
220
221         /* report results to the stats collector, too */
222         pgstat_report_vacuum(RelationGetRelid(onerel),
223                                                  onerel->rd_rel->relisshared,
224                                                  vacrelstats->scanned_all,
225                                                  vacrelstats->rel_tuples);
226
227         /* and log the action if appropriate */
228         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
229         {
230                 if (Log_autovacuum_min_duration == 0 ||
231                         TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
232                                                                            Log_autovacuum_min_duration))
233                         ereport(LOG,
234                                         (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
235                                                         "pages: %d removed, %d remain\n"
236                                                         "tuples: %.0f removed, %.0f remain\n"
237                                                         "system usage: %s",
238                                                         get_database_name(MyDatabaseId),
239                                                         get_namespace_name(RelationGetNamespace(onerel)),
240                                                         RelationGetRelationName(onerel),
241                                                         vacrelstats->num_index_scans,
242                                                   vacrelstats->pages_removed, vacrelstats->rel_pages,
243                                                 vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
244                                                         pg_rusage_show(&ru0))));
245         }
246
247         if (scanned_all)
248                 *scanned_all = vacrelstats->scanned_all;
249 }
250
251 /*
252  * For Hot Standby we need to know the highest transaction id that will
253  * be removed by any change. VACUUM proceeds in a number of passes so
254  * we need to consider how each pass operates. The first phase runs
255  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
256  * progresses - these will have a latestRemovedXid on each record.
257  * In some cases this removes all of the tuples to be removed, though
258  * often we have dead tuples with index pointers so we must remember them
259  * for removal in phase three. Index records for those rows are removed
260  * in phase two, and index blocks do not have MVCC information attached.
261  * So before we can allow removal of any index tuples we need to issue
262  * a WAL record containing the latestRemovedXid of rows that will be
263  * removed in phase three. This allows recovery queries to block at the
264  * correct place, i.e. before phase two, rather than during phase three
265  * which would be after the rows have become inaccessible.
266  */
267 static void
268 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
269 {
270         /*
271  * No need to log changes for temp tables; they do not contain data
272          * visible on the standby server.
273          */
274         if (rel->rd_istemp || !XLogIsNeeded())
275                 return;
276
277         if (vacrelstats->tuples_deleted > 0)
278         {
279                 Assert(TransactionIdIsValid(vacrelstats->latestRemovedXid));
280
281                 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
282         }
283 }
284
285 /*
286  *      lazy_scan_heap() -- scan an open heap relation
287  *
288  *              This routine sets commit status bits, builds lists of dead tuples
289  *              and pages with free space, and calculates statistics on the number
290  *              of live tuples in the heap.  When done, or when we run low on space
291  *              for dead-tuple TIDs, invoke vacuuming of indexes and heap.
292  *
293  *              If there are no indexes then we just vacuum each dirty page as we
294  *              process it, since there's no point in gathering many tuples.
295  */
296 static void
297 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
298                            Relation *Irel, int nindexes, bool scan_all)
299 {
300         BlockNumber nblocks,
301                                 blkno;
302         HeapTupleData tuple;
303         char       *relname;
304         BlockNumber empty_pages,
305                                 scanned_pages,
306                                 vacuumed_pages;
307         double          num_tuples,
308                                 tups_vacuumed,
309                                 nkeep,
310                                 nunused;
311         IndexBulkDeleteResult **indstats;
312         int                     i;
313         PGRUsage        ru0;
314         Buffer          vmbuffer = InvalidBuffer;
315         BlockNumber all_visible_streak;
316
317         pg_rusage_init(&ru0);
318
319         relname = RelationGetRelationName(onerel);
320         ereport(elevel,
321                         (errmsg("vacuuming \"%s.%s\"",
322                                         get_namespace_name(RelationGetNamespace(onerel)),
323                                         relname)));
324
325         empty_pages = vacuumed_pages = scanned_pages = 0;
326         num_tuples = tups_vacuumed = nkeep = nunused = 0;
327
328         indstats = (IndexBulkDeleteResult **)
329                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
330
331         nblocks = RelationGetNumberOfBlocks(onerel);
332         vacrelstats->rel_pages = nblocks;
333         vacrelstats->nonempty_pages = 0;
334         vacrelstats->latestRemovedXid = InvalidTransactionId;
335
336         lazy_space_alloc(vacrelstats, nblocks);
337
338         all_visible_streak = 0;
339         for (blkno = 0; blkno < nblocks; blkno++)
340         {
341                 Buffer          buf;
342                 Page            page;
343                 OffsetNumber offnum,
344                                         maxoff;
345                 bool            tupgone,
346                                         hastup;
347                 int                     prev_dead_count;
348                 OffsetNumber frozen[MaxOffsetNumber];
349                 int                     nfrozen;
350                 Size            freespace;
351                 bool            all_visible_according_to_vm = false;
352                 bool            all_visible;
353
354                 /*
355                  * Skip pages that don't require vacuuming according to the visibility
356                  * map. But only if we've seen a streak of at least
357                  * SKIP_PAGES_THRESHOLD pages marked as clean. Since we're reading
358                  * sequentially, the OS should be doing readahead for us and there's
359                  * no gain in skipping a page now and then. You need a longer run of
360                  * consecutive skipped pages before it's worthwhile. Also, skipping
361                  * even a single page means that we can't update relfrozenxid or
362                  * reltuples, so we only want to do it if there's a good chance to
363                  * skip a goodly number of pages.
364                  */
365                 if (!scan_all)
366                 {
367                         all_visible_according_to_vm =
368                                 visibilitymap_test(onerel, blkno, &vmbuffer);
369                         if (all_visible_according_to_vm)
370                         {
371                                 all_visible_streak++;
372                                 if (all_visible_streak >= SKIP_PAGES_THRESHOLD)
373                                 {
374                                         vacrelstats->scanned_all = false;
375                                         continue;
376                                 }
377                         }
378                         else
379                                 all_visible_streak = 0;
380                 }
381
382                 vacuum_delay_point();
383
384                 scanned_pages++;
385
386                 /*
387                  * If we are close to overrunning the available space for dead-tuple
388                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
389                  */
390                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
391                         vacrelstats->num_dead_tuples > 0)
392                 {
393                         /* Log cleanup info before we touch indexes */
394                         vacuum_log_cleanup_info(onerel, vacrelstats);
395
396                         /* Remove index entries */
397                         for (i = 0; i < nindexes; i++)
398                                 lazy_vacuum_index(Irel[i],
399                                                                   &indstats[i],
400                                                                   vacrelstats);
401                         /* Remove tuples from heap */
402                         lazy_vacuum_heap(onerel, vacrelstats);
403                         /*
404                          * Forget the now-vacuumed tuples, and press on, but be careful
405                          * not to reset latestRemovedXid since we want that value to be valid.
406                          */
407                         vacrelstats->num_dead_tuples = 0;
408                         vacrelstats->num_index_scans++;
409                 }
410
411                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
412                                                                  RBM_NORMAL, vac_strategy);
413
414                 /* We need buffer cleanup lock so that we can prune HOT chains. */
415                 LockBufferForCleanup(buf);
416
417                 page = BufferGetPage(buf);
418
419                 if (PageIsNew(page))
420                 {
421                         /*
422                          * An all-zeroes page could be left over if a backend extends the
423                          * relation but crashes before initializing the page. Reclaim such
424                          * pages for use.
425                          *
426                          * We have to be careful here because we could be looking at a
427                          * page that someone has just added to the relation and not yet
428                          * been able to initialize (see RelationGetBufferForTuple). To
429                          * protect against that, release the buffer lock, grab the
430                          * relation extension lock momentarily, and re-lock the buffer. If
431                          * the page is still uninitialized by then, it must be left over
432                          * from a crashed backend, and we can initialize it.
433                          *
434                          * We don't really need the relation lock when this is a new or
435                          * temp relation, but it's probably not worth the code space to
436                          * check that, since this surely isn't a critical path.
437                          *
438                          * Note: the comparable code in vacuum.c need not worry because
439                          * it's got exclusive lock on the whole relation.
440                          */
441                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
442                         LockRelationForExtension(onerel, ExclusiveLock);
443                         UnlockRelationForExtension(onerel, ExclusiveLock);
444                         LockBufferForCleanup(buf);
445                         if (PageIsNew(page))
446                         {
447                                 ereport(WARNING,
448                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
449                                                 relname, blkno)));
450                                 PageInit(page, BufferGetPageSize(buf), 0);
451                                 empty_pages++;
452                         }
453                         freespace = PageGetHeapFreeSpace(page);
454                         MarkBufferDirty(buf);
455                         UnlockReleaseBuffer(buf);
456
457                         RecordPageWithFreeSpace(onerel, blkno, freespace);
458                         continue;
459                 }
460
461                 if (PageIsEmpty(page))
462                 {
463                         empty_pages++;
464                         freespace = PageGetHeapFreeSpace(page);
465
466                         if (!PageIsAllVisible(page))
467                         {
468                                 PageSetAllVisible(page);
469                                 SetBufferCommitInfoNeedsSave(buf);
470                         }
471
472                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
473
474                         /* Update the visibility map */
475                         if (!all_visible_according_to_vm)
476                         {
477                                 visibilitymap_pin(onerel, blkno, &vmbuffer);
478                                 LockBuffer(buf, BUFFER_LOCK_SHARE);
479                                 if (PageIsAllVisible(page))
480                                         visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
481                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
482                         }
483
484                         ReleaseBuffer(buf);
485                         RecordPageWithFreeSpace(onerel, blkno, freespace);
486                         continue;
487                 }
488
489                 /*
490                  * Prune all HOT-update chains in this page.
491                  *
492                  * We count tuples removed by the pruning step as removed by VACUUM.
493                  */
494                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
495                                                                                                         &vacrelstats->latestRemovedXid);
496                 /*
497                  * Now scan the page to collect vacuumable items and check for tuples
498                  * requiring freezing.
499                  */
500                 all_visible = true;
501                 nfrozen = 0;
502                 hastup = false;
503                 prev_dead_count = vacrelstats->num_dead_tuples;
504                 maxoff = PageGetMaxOffsetNumber(page);
505                 for (offnum = FirstOffsetNumber;
506                          offnum <= maxoff;
507                          offnum = OffsetNumberNext(offnum))
508                 {
509                         ItemId          itemid;
510
511                         itemid = PageGetItemId(page, offnum);
512
513                         /* Unused items require no processing, but we count 'em */
514                         if (!ItemIdIsUsed(itemid))
515                         {
516                                 nunused += 1;
517                                 continue;
518                         }
519
520                         /* Redirect items mustn't be touched */
521                         if (ItemIdIsRedirected(itemid))
522                         {
523                                 hastup = true;  /* this page won't be truncatable */
524                                 continue;
525                         }
526
527                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
528
529                         /*
530                          * DEAD item pointers are to be vacuumed normally; but we don't
531                          * count them in tups_vacuumed, else we'd be double-counting (at
532                          * least in the common case where heap_page_prune() just freed up
533                          * a non-HOT tuple).
534                          */
535                         if (ItemIdIsDead(itemid))
536                         {
537                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
538                                 all_visible = false;
539                                 continue;
540                         }
541
542                         Assert(ItemIdIsNormal(itemid));
543
544                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
545                         tuple.t_len = ItemIdGetLength(itemid);
546
547                         tupgone = false;
548
549                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
550                         {
551                                 case HEAPTUPLE_DEAD:
552
553                                         /*
554                                          * Ordinarily, DEAD tuples would have been removed by
555                                          * heap_page_prune(), but it's possible that the tuple
556                                          * state changed since heap_page_prune() looked.  In
557                                          * particular an INSERT_IN_PROGRESS tuple could have
558                                          * changed to DEAD if the inserter aborted.  So this
559                                          * cannot be considered an error condition.
560                                          *
561                                          * If the tuple is HOT-updated then it must only be
562                                          * removed by a prune operation; so we keep it just as if
563                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
564                                          * tuple, we choose to keep it, because it'll be a lot
565                                          * cheaper to get rid of it in the next pruning pass than
566                                          * to treat it like an indexed tuple.
567                                          */
568                                         if (HeapTupleIsHotUpdated(&tuple) ||
569                                                 HeapTupleIsHeapOnly(&tuple))
570                                                 nkeep += 1;
571                                         else
572                                                 tupgone = true; /* we can delete the tuple */
573                                         all_visible = false;
574                                         break;
575                                 case HEAPTUPLE_LIVE:
576                                         /* Tuple is good --- but let's do some validity checks */
577                                         if (onerel->rd_rel->relhasoids &&
578                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
579                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
580                                                          relname, blkno, offnum);
581
582                                         /*
583                                          * Is the tuple definitely visible to all transactions?
584                                          *
585                                          * NB: Like with per-tuple hint bits, we can't set the
586                                          * PD_ALL_VISIBLE flag if the inserter committed
587                                          * asynchronously. See SetHintBits for more info. Check
588                                          * that the HEAP_XMIN_COMMITTED hint bit is set because of
589                                          * that.
590                                          */
591                                         if (all_visible)
592                                         {
593                                                 TransactionId xmin;
594
595                                                 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
596                                                 {
597                                                         all_visible = false;
598                                                         break;
599                                                 }
600
601                                                 /*
602                                                  * The inserter definitely committed. But is it old
603                                                  * enough that everyone sees it as committed?
604                                                  */
605                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
606                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
607                                                 {
608                                                         all_visible = false;
609                                                         break;
610                                                 }
611                                         }
612                                         break;
613                                 case HEAPTUPLE_RECENTLY_DEAD:
614
615                                         /*
616                                          * If tuple is recently deleted then we must not remove it
617                                          * from relation.
618                                          */
619                                         nkeep += 1;
620                                         all_visible = false;
621                                         break;
622                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
623                                         /* This is an expected case during concurrent vacuum */
624                                         all_visible = false;
625                                         break;
626                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
627                                         /* This is an expected case during concurrent vacuum */
628                                         all_visible = false;
629                                         break;
630                                 default:
631                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
632                                         break;
633                         }
634
635                         if (tupgone)
636                         {
637                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
638                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
639                                                                                          &vacrelstats->latestRemovedXid);
640                                 tups_vacuumed += 1;
641                         }
642                         else
643                         {
644                                 num_tuples += 1;
645                                 hastup = true;
646
647                                 /*
648                                  * Each non-removable tuple must be checked to see if it needs
649                                  * freezing.  Note we already have exclusive buffer lock.
650                                  */
651                                 if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
652                                                                           InvalidBuffer))
653                                         frozen[nfrozen++] = offnum;
654                         }
655                 }                                               /* scan along page */
656
657                 /*
658                  * If we froze any tuples, mark the buffer dirty, and write a WAL
659                  * record recording the changes.  We must log the changes to be
660                  * crash-safe against future truncation of CLOG.
661                  */
662                 if (nfrozen > 0)
663                 {
664                         MarkBufferDirty(buf);
665                         /* no XLOG for temp tables, though */
666                         if (!onerel->rd_istemp)
667                         {
668                                 XLogRecPtr      recptr;
669
670                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
671                                                                                  frozen, nfrozen);
672                                 PageSetLSN(page, recptr);
673                                 PageSetTLI(page, ThisTimeLineID);
674                         }
675                 }
676
677                 /*
678                  * If there are no indexes then we can vacuum the page right now
679                  * instead of doing a second scan.
680                  */
681                 if (nindexes == 0 &&
682                         vacrelstats->num_dead_tuples > 0)
683                 {
684                         /* Remove tuples from heap */
685                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
686                         /*
687                          * Forget the now-vacuumed tuples, and press on, but be careful
688                          * not to reset latestRemovedXid since we want that value to be valid.
689                          */
690                         Assert(TransactionIdIsValid(vacrelstats->latestRemovedXid));
691                         vacrelstats->num_dead_tuples = 0;
692                         vacuumed_pages++;
693                 }
694
695                 freespace = PageGetHeapFreeSpace(page);
696
697                 /* Update the all-visible flag on the page */
698                 if (!PageIsAllVisible(page) && all_visible)
699                 {
700                         PageSetAllVisible(page);
701                         SetBufferCommitInfoNeedsSave(buf);
702                 }
703                 else if (PageIsAllVisible(page) && !all_visible)
704                 {
705                         elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set in relation \"%s\" page %u",
706                                  relname, blkno);
707                         PageClearAllVisible(page);
708                         SetBufferCommitInfoNeedsSave(buf);
709
710                         /*
711                          * Normally, we would drop the lock on the heap page before
712                          * updating the visibility map, but since this case shouldn't
713                          * happen anyway, don't worry about that.
714                          */
715                         visibilitymap_clear(onerel, blkno);
716                 }
717
718                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
719
720                 /* Update the visibility map */
721                 if (!all_visible_according_to_vm && all_visible)
722                 {
723                         visibilitymap_pin(onerel, blkno, &vmbuffer);
724                         LockBuffer(buf, BUFFER_LOCK_SHARE);
725                         if (PageIsAllVisible(page))
726                                 visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
727                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
728                 }
729
730                 ReleaseBuffer(buf);
731
732                 /* Remember the location of the last page with nonremovable tuples */
733                 if (hastup)
734                         vacrelstats->nonempty_pages = blkno + 1;
735
736                 /*
737                  * If we remembered any tuples for deletion, then the page will be
738                  * visited again by lazy_vacuum_heap, which will compute and record
739                  * its post-compaction free space.      If not, then we're done with this
740                  * page, so remember its free space as-is.      (This path will always be
741                  * taken if there are no indexes.)
742                  */
743                 if (vacrelstats->num_dead_tuples == prev_dead_count)
744                         RecordPageWithFreeSpace(onerel, blkno, freespace);
745         }
746
747         /* save stats for use later */
748         vacrelstats->rel_tuples = num_tuples;
749         vacrelstats->tuples_deleted = tups_vacuumed;
750
751         /* If any tuples need to be deleted, perform final vacuum cycle */
752         /* XXX put a threshold on min number of tuples here? */
753         if (vacrelstats->num_dead_tuples > 0)
754         {
755                 /* Log cleanup info before we touch indexes */
756                 vacuum_log_cleanup_info(onerel, vacrelstats);
757
758                 /* Remove index entries */
759                 for (i = 0; i < nindexes; i++)
760                         lazy_vacuum_index(Irel[i],
761                                                           &indstats[i],
762                                                           vacrelstats);
763                 /* Remove tuples from heap */
764                 lazy_vacuum_heap(onerel, vacrelstats);
765                 vacrelstats->num_index_scans++;
766         }
767
768         /* Release the pin on the visibility map page */
769         if (BufferIsValid(vmbuffer))
770         {
771                 ReleaseBuffer(vmbuffer);
772                 vmbuffer = InvalidBuffer;
773         }
774
775         /* Do post-vacuum cleanup and statistics update for each index */
776         for (i = 0; i < nindexes; i++)
777                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
778
779         /* If no indexes, make log report that lazy_vacuum_heap would've made */
780         if (vacuumed_pages)
781                 ereport(elevel,
782                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
783                                                 RelationGetRelationName(onerel),
784                                                 tups_vacuumed, vacuumed_pages)));
785
786         ereport(elevel,
787                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
788                                         RelationGetRelationName(onerel),
789                                         tups_vacuumed, num_tuples, scanned_pages, nblocks),
790                          errdetail("%.0f dead row versions cannot be removed yet.\n"
791                                            "There were %.0f unused item pointers.\n"
792                                            "%u pages are entirely empty.\n"
793                                            "%s.",
794                                            nkeep,
795                                            nunused,
796                                            empty_pages,
797                                            pg_rusage_show(&ru0))));
798 }
799
800
801 /*
802  *      lazy_vacuum_heap() -- second pass over the heap
803  *
804  *              This routine marks dead tuples as unused and compacts out free
805  *              space on their pages.  Pages not having dead tuples recorded from
806  *              lazy_scan_heap are not visited at all.
807  *
808  * Note: the reason for doing this as a second pass is we cannot remove
809  * the tuples until we've removed their index entries, and we want to
810  * process index entry removal in batches as large as possible.
811  */
812 static void
813 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
814 {
815         int                     tupindex;
816         int                     npages;
817         PGRUsage        ru0;
818
819         pg_rusage_init(&ru0);
820         npages = 0;
821
822         tupindex = 0;
823         while (tupindex < vacrelstats->num_dead_tuples)
824         {
825                 BlockNumber tblk;
826                 Buffer          buf;
827                 Page            page;
828                 Size            freespace;
829
830                 vacuum_delay_point();
831
832                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
833                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
834                                                                  vac_strategy);
835                 LockBufferForCleanup(buf);
836                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
837
838                 /* Now that we've compacted the page, record its available space */
839                 page = BufferGetPage(buf);
840                 freespace = PageGetHeapFreeSpace(page);
841
842                 UnlockReleaseBuffer(buf);
843                 RecordPageWithFreeSpace(onerel, tblk, freespace);
844                 npages++;
845         }
846
847         ereport(elevel,
848                         (errmsg("\"%s\": removed %d row versions in %d pages",
849                                         RelationGetRelationName(onerel),
850                                         tupindex, npages),
851                          errdetail("%s.",
852                                            pg_rusage_show(&ru0))));
853 }
854
855 /*
856  *      lazy_vacuum_page() -- free dead tuples on a page
857  *                                       and repair its fragmentation.
858  *
859  * Caller must hold pin and buffer cleanup lock on the buffer.
860  *
861  * tupindex is the index in vacrelstats->dead_tuples of the first dead
862  * tuple for this page.  We assume the rest follow sequentially.
863  * The return value is the first tupindex after the tuples of this page.
864  */
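/*
 * Illustrative example (editorial): if dead_tuples holds the TIDs
 * (7,2) (7,5) (9,1) and we are called for blkno 7 with tupindex 0, items
 * 2 and 5 on page 7 are marked unused and the return value is 2, the index
 * of the first entry belonging to a later block.
 */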
865 static int
866 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
867                                  int tupindex, LVRelStats *vacrelstats)
868 {
869         Page            page = BufferGetPage(buffer);
870         OffsetNumber unused[MaxOffsetNumber];
871         int                     uncnt = 0;
872
873         START_CRIT_SECTION();
874
875         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
876         {
877                 BlockNumber tblk;
878                 OffsetNumber toff;
879                 ItemId          itemid;
880
881                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
882                 if (tblk != blkno)
883                         break;                          /* past end of tuples for this block */
884                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
885                 itemid = PageGetItemId(page, toff);
886                 ItemIdSetUnused(itemid);
887                 unused[uncnt++] = toff;
888         }
889
890         PageRepairFragmentation(page);
891
892         MarkBufferDirty(buffer);
893
894         /* XLOG stuff */
895         if (!onerel->rd_istemp)
896         {
897                 XLogRecPtr      recptr;
898
899                 recptr = log_heap_clean(onerel, buffer,
900                                                                 NULL, 0, NULL, 0,
901                                                                 unused, uncnt,
902                                                                 vacrelstats->latestRemovedXid);
903                 PageSetLSN(page, recptr);
904                 PageSetTLI(page, ThisTimeLineID);
905         }
906
907         END_CRIT_SECTION();
908
909         return tupindex;
910 }
911
912 /*
913  *      lazy_vacuum_index() -- vacuum one index relation.
914  *
915  *              Delete all the index entries pointing to tuples listed in
916  *              vacrelstats->dead_tuples, and update running statistics.
917  */
918 static void
919 lazy_vacuum_index(Relation indrel,
920                                   IndexBulkDeleteResult **stats,
921                                   LVRelStats *vacrelstats)
922 {
923         IndexVacuumInfo ivinfo;
924         PGRUsage        ru0;
925
926         pg_rusage_init(&ru0);
927
928         ivinfo.index = indrel;
929         ivinfo.analyze_only = false;
930         ivinfo.estimated_count = true;
931         ivinfo.message_level = elevel;
932         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
933         ivinfo.strategy = vac_strategy;
934
935         /* Do bulk deletion */
936         *stats = index_bulk_delete(&ivinfo, *stats,
937                                                            lazy_tid_reaped, (void *) vacrelstats);
938
939         ereport(elevel,
940                         (errmsg("scanned index \"%s\" to remove %d row versions",
941                                         RelationGetRelationName(indrel),
942                                         vacrelstats->num_dead_tuples),
943                          errdetail("%s.", pg_rusage_show(&ru0))));
944 }
945
946 /*
947  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
948  */
949 static void
950 lazy_cleanup_index(Relation indrel,
951                                    IndexBulkDeleteResult *stats,
952                                    LVRelStats *vacrelstats)
953 {
954         IndexVacuumInfo ivinfo;
955         PGRUsage        ru0;
956
957         pg_rusage_init(&ru0);
958
959         ivinfo.index = indrel;
960         ivinfo.analyze_only = false;
961         ivinfo.estimated_count = !vacrelstats->scanned_all;
962         ivinfo.message_level = elevel;
963         /* use rel_tuples only if we scanned all pages, else fall back */
964         ivinfo.num_heap_tuples = vacrelstats->scanned_all ? vacrelstats->rel_tuples : vacrelstats->old_rel_tuples;
965         ivinfo.strategy = vac_strategy;
966
967         stats = index_vacuum_cleanup(&ivinfo, stats);
968
969         if (!stats)
970                 return;
971
972         /*
973          * Now update statistics in pg_class, but only if the index says the count
974          * is accurate.
975          */
976         if (!stats->estimated_count)
977                 vac_update_relstats(indrel,
978                                                         stats->num_pages, stats->num_index_tuples,
979                                                         false, InvalidTransactionId);
980
981         ereport(elevel,
982                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
983                                         RelationGetRelationName(indrel),
984                                         stats->num_index_tuples,
985                                         stats->num_pages),
986                          errdetail("%.0f index row versions were removed.\n"
987                          "%u index pages have been deleted, %u are currently reusable.\n"
988                                            "%s.",
989                                            stats->tuples_removed,
990                                            stats->pages_deleted, stats->pages_free,
991                                            pg_rusage_show(&ru0))));
992
993         pfree(stats);
994 }
995
996 /*
997  * lazy_truncate_heap - try to truncate off any empty pages at the end
998  */
999 static void
1000 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1001 {
1002         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1003         BlockNumber new_rel_pages;
1004         PGRUsage        ru0;
1005
1006         pg_rusage_init(&ru0);
1007
1008         /*
1009          * We need full exclusive lock on the relation in order to do truncation.
1010          * If we can't get it, give up rather than waiting --- we don't want to
1011          * block other backends, and we don't want to deadlock (which is quite
1012          * possible considering we already hold a lower-grade lock).
1013          */
1014         if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
1015                 return;
1016
1017         /*
1018          * Now that we have exclusive lock, look to see if the rel has grown
1019          * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
1020          * newly added pages presumably contain non-deletable tuples.
1021          */
1022         new_rel_pages = RelationGetNumberOfBlocks(onerel);
1023         if (new_rel_pages != old_rel_pages)
1024         {
1025                 /* might as well use the latest news when we update pg_class stats */
1026                 vacrelstats->rel_pages = new_rel_pages;
1027                 UnlockRelation(onerel, AccessExclusiveLock);
1028                 return;
1029         }
1030
1031         /*
1032          * Scan backwards from the end to verify that the end pages actually
1033          * contain no tuples.  This is *necessary*, not optional, because other
1034          * backends could have added tuples to these pages whilst we were
1035          * vacuuming.
1036          */
1037         new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1038
1039         if (new_rel_pages >= old_rel_pages)
1040         {
1041                 /* can't do anything after all */
1042                 UnlockRelation(onerel, AccessExclusiveLock);
1043                 return;
1044         }
1045
1046         /*
1047          * Okay to truncate.
1048          */
1049         RelationTruncate(onerel, new_rel_pages);
1050
1051         /*
1052          * We can release the exclusive lock as soon as we have truncated.      Other
1053          * backends can't safely access the relation until they have processed the
1054          * smgr invalidation that smgrtruncate sent out ... but that should happen
1055          * as part of standard invalidation processing once they acquire lock on
1056          * the relation.
1057          */
1058         UnlockRelation(onerel, AccessExclusiveLock);
1059
1060         /* update statistics */
1061         vacrelstats->rel_pages = new_rel_pages;
1062         vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
1063
1064         ereport(elevel,
1065                         (errmsg("\"%s\": truncated %u to %u pages",
1066                                         RelationGetRelationName(onerel),
1067                                         old_rel_pages, new_rel_pages),
1068                          errdetail("%s.",
1069                                            pg_rusage_show(&ru0))));
1070 }
1071
1072 /*
1073  * Rescan end pages to verify that they are (still) empty of tuples.
1074  *
1075  * Returns number of nondeletable pages (last nonempty page + 1).
1076  */
1077 static BlockNumber
1078 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1079 {
1080         BlockNumber blkno;
1081
1082         /* Strange coding of loop control is needed because blkno is unsigned */
1083         blkno = vacrelstats->rel_pages;
1084         while (blkno > vacrelstats->nonempty_pages)
1085         {
1086                 Buffer          buf;
1087                 Page            page;
1088                 OffsetNumber offnum,
1089                                         maxoff;
1090                 bool            hastup;
1091
1092                 /*
1093                  * We don't insert a vacuum delay point here, because we have an
1094                  * exclusive lock on the table which we want to hold for as short a
1095                  * time as possible.  We still need to check for interrupts however.
1096                  */
1097                 CHECK_FOR_INTERRUPTS();
1098
1099                 blkno--;
1100
1101                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1102                                                                  RBM_NORMAL, vac_strategy);
1103
1104                 /* In this phase we only need shared access to the buffer */
1105                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1106
1107                 page = BufferGetPage(buf);
1108
1109                 if (PageIsNew(page) || PageIsEmpty(page))
1110                 {
1111                         /* PageIsNew probably shouldn't happen... */
1112                         UnlockReleaseBuffer(buf);
1113                         continue;
1114                 }
1115
1116                 hastup = false;
1117                 maxoff = PageGetMaxOffsetNumber(page);
1118                 for (offnum = FirstOffsetNumber;
1119                          offnum <= maxoff;
1120                          offnum = OffsetNumberNext(offnum))
1121                 {
1122                         ItemId          itemid;
1123
1124                         itemid = PageGetItemId(page, offnum);
1125
1126                         /*
1127                          * Note: any non-unused item should be taken as a reason to keep
1128                          * this page.  We formerly thought that DEAD tuples could be
1129                          * thrown away, but that's not so, because we'd not have cleaned
1130                          * out their index entries.
1131                          */
1132                         if (ItemIdIsUsed(itemid))
1133                         {
1134                                 hastup = true;
1135                                 break;                  /* can stop scanning */
1136                         }
1137                 }                                               /* scan along page */
1138
1139                 UnlockReleaseBuffer(buf);
1140
1141                 /* Done scanning if we found a tuple here */
1142                 if (hastup)
1143                         return blkno + 1;
1144         }
1145
1146         /*
1147          * If we fall out of the loop, all the previously-thought-to-be-empty
1148          * pages still are; we need not bother to look at the last known-nonempty
1149          * page.
1150          */
1151         return vacrelstats->nonempty_pages;
1152 }
1153
1154 /*
1155  * lazy_space_alloc - space allocation decisions for lazy vacuum
1156  *
1157  * See the comments at the head of this file for rationale.
1158  */
1159 static void
1160 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1161 {
1162         long            maxtuples;
1163
1164         if (vacrelstats->hasindex)
1165         {
1166                 maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
1167                 maxtuples = Min(maxtuples, INT_MAX);
1168                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1169
1170                 /* curious coding here to ensure the multiplication can't overflow */
1171                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1172                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1173
1174                 /* stay sane if small maintenance_work_mem */
1175                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1176         }
1177         else
1178         {
1179                 maxtuples = MaxHeapTuplesPerPage;
1180         }
1181
1182         vacrelstats->num_dead_tuples = 0;
1183         vacrelstats->max_dead_tuples = (int) maxtuples;
1184         vacrelstats->dead_tuples = (ItemPointer)
1185                 palloc(maxtuples * sizeof(ItemPointerData));
1186 }
1187
1188 /*
1189  * lazy_record_dead_tuple - remember one deletable tuple
1190  */
1191 static void
1192 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1193                                            ItemPointer itemptr)
1194 {
1195         /*
1196          * The array shouldn't overflow under normal behavior, but perhaps it
1197          * could if we are given a really small maintenance_work_mem. In that
1198          * case, just forget the last few tuples (we'll get 'em next time).
1199          */
1200         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1201         {
1202                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1203                 vacrelstats->num_dead_tuples++;
1204         }
1205 }
1206
1207 /*
1208  *      lazy_tid_reaped() -- is a particular tid deletable?
1209  *
1210  *              This has the right signature to be an IndexBulkDeleteCallback.
1211  *
1212  *              Assumes dead_tuples array is in sorted order.
1213  */
1214 static bool
1215 lazy_tid_reaped(ItemPointer itemptr, void *state)
1216 {
1217         LVRelStats *vacrelstats = (LVRelStats *) state;
1218         ItemPointer res;
1219
1220         res = (ItemPointer) bsearch((void *) itemptr,
1221                                                                 (void *) vacrelstats->dead_tuples,
1222                                                                 vacrelstats->num_dead_tuples,
1223                                                                 sizeof(ItemPointerData),
1224                                                                 vac_cmp_itemptr);
1225
1226         return (res != NULL);
1227 }
1228
1229 /*
1230  * Comparator routines for use with qsort() and bsearch().
1231  */
1232 static int
1233 vac_cmp_itemptr(const void *left, const void *right)
1234 {
1235         BlockNumber lblk,
1236                                 rblk;
1237         OffsetNumber loff,
1238                                 roff;
1239
1240         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1241         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1242
1243         if (lblk < rblk)
1244                 return -1;
1245         if (lblk > rblk)
1246                 return 1;
1247
1248         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1249         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1250
1251         if (loff < roff)
1252                 return -1;
1253         if (loff > roff)
1254                 return 1;
1255
1256         return 0;
1257 }