1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem memory space to keep
14  * track of dead tuples.  We initially allocate an array of TIDs of that size,
15  * with an upper limit that depends on table size (this limit ensures we don't
16  * allocate a huge area uselessly for vacuuming small tables).  If the array
17  * threatens to overflow, we suspend the heap scan phase and perform a pass of
18  * index cleanup and page compaction, then resume the heap scan with an empty
19  * TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.133 2010/04/21 17:20:56 sriggs Exp $
33  *
34  *-------------------------------------------------------------------------
35  */
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/transam.h"
43 #include "access/visibilitymap.h"
44 #include "catalog/storage.h"
45 #include "commands/dbcommands.h"
46 #include "commands/vacuum.h"
47 #include "miscadmin.h"
48 #include "pgstat.h"
49 #include "postmaster/autovacuum.h"
50 #include "storage/bufmgr.h"
51 #include "storage/freespace.h"
52 #include "storage/lmgr.h"
53 #include "utils/lsyscache.h"
54 #include "utils/memutils.h"
55 #include "utils/pg_rusage.h"
56 #include "utils/tqual.h"
57
58
59 /*
60  * Space/time tradeoff parameters: do these need to be user-tunable?
61  *
62  * To consider truncating the relation, we want there to be at least
63  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
64  * is less) potentially-freeable pages.
65  */
66 #define REL_TRUNCATE_MINIMUM    1000
67 #define REL_TRUNCATE_FRACTION   16
68
69 /*
70  * Guesstimation of number of dead tuples per page.  This is used to
71  * provide an upper limit on the memory allocated when vacuuming small
72  * tables.
73  */
74 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
75
76 /*
77  * Before we consider skipping a page that's marked as clean in
78  * the visibility map, we must've seen at least this many clean pages.
79  */
80 #define SKIP_PAGES_THRESHOLD    32
81
82 typedef struct LVRelStats
83 {
84         /* hasindex = true means two-pass strategy; false means one-pass */
85         bool            hasindex;
86         bool            scanned_all;    /* have we scanned all pages (this far)? */
87         /* Overall statistics about rel */
88         BlockNumber rel_pages;
89         double          old_rel_tuples; /* previous value of pg_class.reltuples */
90         double          rel_tuples;             /* counts only tuples on scanned pages */
91         BlockNumber pages_removed;
92         double          tuples_deleted;
93         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
94         /* List of TIDs of tuples we intend to delete */
95         /* NB: this list is ordered by TID address */
96         int                     num_dead_tuples;        /* current # of entries */
97         int                     max_dead_tuples;        /* # slots allocated in array */
98         ItemPointer dead_tuples;        /* array of ItemPointerData */
99         int                     num_index_scans;
100         TransactionId latestRemovedXid;
101 } LVRelStats;
102
103
104 /* A few variables that don't seem worth passing around as parameters */
105 static int      elevel = -1;
106
107 static TransactionId OldestXmin;
108 static TransactionId FreezeLimit;
109
110 static BufferAccessStrategy vac_strategy;
111
112
113 /* non-export function prototypes */
114 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
115                            Relation *Irel, int nindexes, bool scan_all);
116 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
117 static void lazy_vacuum_index(Relation indrel,
118                                   IndexBulkDeleteResult **stats,
119                                   LVRelStats *vacrelstats);
120 static void lazy_cleanup_index(Relation indrel,
121                                    IndexBulkDeleteResult *stats,
122                                    LVRelStats *vacrelstats);
123 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
124                                  int tupindex, LVRelStats *vacrelstats);
125 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
126 static BlockNumber count_nondeletable_pages(Relation onerel,
127                                                  LVRelStats *vacrelstats);
128 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
129 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
130                                            ItemPointer itemptr);
131 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
132 static int      vac_cmp_itemptr(const void *left, const void *right);
133
134
135 /*
136  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
137  *
138  *              This routine vacuums a single heap, cleans out its indexes, and
139  *              updates its relpages and reltuples statistics.
140  *
141  *              At entry, we have already established a transaction and opened
142  *              and locked the relation.
143  */
144 void
145 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
146                                 BufferAccessStrategy bstrategy, bool *scanned_all)
147 {
148         LVRelStats *vacrelstats;
149         Relation   *Irel;
150         int                     nindexes;
151         BlockNumber possibly_freeable;
152         PGRUsage        ru0;
153         TimestampTz starttime = 0;
154         bool            scan_all;
155         TransactionId freezeTableLimit;
156
157         pg_rusage_init(&ru0);
158
159         /* measure elapsed time iff autovacuum logging requires it */
160         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration > 0)
161                 starttime = GetCurrentTimestamp();
162
163         if (vacstmt->options & VACOPT_VERBOSE)
164                 elevel = INFO;
165         else
166                 elevel = DEBUG2;
167
168         vac_strategy = bstrategy;
169
170         vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
171                                                   onerel->rd_rel->relisshared,
172                                                   &OldestXmin, &FreezeLimit, &freezeTableLimit);
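            /*
             * scan_all requests a full scan of the table (no visibility-map
             * skipping) when relfrozenxid is old enough that it needs to be
             * advanced; only a scan of every page can safely do that.
             */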
173         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
174                                                                                          freezeTableLimit);
175
176         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
177
178         vacrelstats->scanned_all = true;        /* will be cleared if we skip a page */
179         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
180         vacrelstats->num_index_scans = 0;
181
182         /* Open all indexes of the relation */
183         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
184         vacrelstats->hasindex = (nindexes > 0);
185
186         /* Do the vacuuming */
187         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
188
189         /* Done with indexes */
190         vac_close_indexes(nindexes, Irel, NoLock);
191
192         /*
193          * Optionally truncate the relation.
194          *
195          * Don't even think about it unless we have a shot at releasing a goodly
196          * number of pages.  Otherwise, the time taken isn't worth it.
197          */
198         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
199         if (possibly_freeable > 0 &&
200                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
201                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
202                 lazy_truncate_heap(onerel, vacrelstats);
203
204         /* Vacuum the Free Space Map */
205         FreeSpaceMapVacuum(onerel);
206
207         /*
208          * Update statistics in pg_class.  But only if we didn't skip any pages;
209          * the tuple count only includes tuples from the pages we've visited, and
210          * we haven't frozen tuples in unvisited pages either.  The page count is
211          * accurate in any case, but because we use the reltuples / relpages ratio
212          * in the planner, it's better to not update relpages either if we can't
213          * update reltuples.
214          */
215         if (vacrelstats->scanned_all)
216                 vac_update_relstats(onerel,
217                                                         vacrelstats->rel_pages, vacrelstats->rel_tuples,
218                                                         vacrelstats->hasindex,
219                                                         FreezeLimit);
220
221         /* report results to the stats collector, too */
222         pgstat_report_vacuum(RelationGetRelid(onerel),
223                                                  onerel->rd_rel->relisshared,
224                                                  vacrelstats->scanned_all,
225                                                  vacrelstats->rel_tuples);
226
227         /* and log the action if appropriate */
228         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
229         {
230                 if (Log_autovacuum_min_duration == 0 ||
231                         TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
232                                                                            Log_autovacuum_min_duration))
233                         ereport(LOG,
234                                         (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
235                                                         "pages: %d removed, %d remain\n"
236                                                         "tuples: %.0f removed, %.0f remain\n"
237                                                         "system usage: %s",
238                                                         get_database_name(MyDatabaseId),
239                                                         get_namespace_name(RelationGetNamespace(onerel)),
240                                                         RelationGetRelationName(onerel),
241                                                         vacrelstats->num_index_scans,
242                                                   vacrelstats->pages_removed, vacrelstats->rel_pages,
243                                                 vacrelstats->tuples_deleted, vacrelstats->rel_tuples,
244                                                         pg_rusage_show(&ru0))));
245         }
246
247         if (scanned_all)
248                 *scanned_all = vacrelstats->scanned_all;
249 }
250
251 /*
252  * For Hot Standby we need to know the highest transaction id that will
253  * be removed by any change. VACUUM proceeds in a number of passes so
254  * we need to consider how each pass operates. The first phase runs
255  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
256  * progresses - these will have a latestRemovedXid on each record.
257  * In some cases this removes all of the tuples to be removed, though
258  * often we have dead tuples with index pointers so we must remember them
259  * for removal in phase 3. Index records for those rows are removed
260  * in phase 2 and index blocks do not have MVCC information attached.
261  * So before we can allow removal of any index tuples we need to issue
262  * a WAL record containing the latestRemovedXid of rows that will be
263  * removed in phase 3. This allows recovery queries to block at the
264  * correct place, i.e. before phase 2, rather than during phase 3,
265  * which would be after the rows have become inaccessible.
266  */
267 static void
268 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
269 {
270         /*
271  * No need to log changes for temp tables; they do not contain data
272          * visible on the standby server.
273          */
274         if (rel->rd_istemp || !XLogIsNeeded())
275                 return;
276
277         Assert(TransactionIdIsValid(vacrelstats->latestRemovedXid));
278
279         (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
280 }
281
282 /*
283  *      lazy_scan_heap() -- scan an open heap relation
284  *
285  *              This routine sets commit status bits, builds lists of dead tuples
286  *              and pages with free space, and calculates statistics on the number
287  *              of live tuples in the heap.  When done, or when we run low on space
288  *              for dead-tuple TIDs, invoke vacuuming of indexes and heap.
289  *
290  *              If there are no indexes then we just vacuum each dirty page as we
291  *              process it, since there's no point in gathering many tuples.
292  */
293 static void
294 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
295                            Relation *Irel, int nindexes, bool scan_all)
296 {
297         BlockNumber nblocks,
298                                 blkno;
299         HeapTupleData tuple;
300         char       *relname;
301         BlockNumber empty_pages,
302                                 scanned_pages,
303                                 vacuumed_pages;
304         double          num_tuples,
305                                 tups_vacuumed,
306                                 nkeep,
307                                 nunused;
308         IndexBulkDeleteResult **indstats;
309         int                     i;
310         PGRUsage        ru0;
311         Buffer          vmbuffer = InvalidBuffer;
312         BlockNumber all_visible_streak;
313
314         pg_rusage_init(&ru0);
315
316         relname = RelationGetRelationName(onerel);
317         ereport(elevel,
318                         (errmsg("vacuuming \"%s.%s\"",
319                                         get_namespace_name(RelationGetNamespace(onerel)),
320                                         relname)));
321
322         empty_pages = vacuumed_pages = scanned_pages = 0;
323         num_tuples = tups_vacuumed = nkeep = nunused = 0;
324
325         indstats = (IndexBulkDeleteResult **)
326                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
327
328         nblocks = RelationGetNumberOfBlocks(onerel);
329         vacrelstats->rel_pages = nblocks;
330         vacrelstats->nonempty_pages = 0;
331         vacrelstats->latestRemovedXid = InvalidTransactionId;
332
333         lazy_space_alloc(vacrelstats, nblocks);
334
335         all_visible_streak = 0;
336         for (blkno = 0; blkno < nblocks; blkno++)
337         {
338                 Buffer          buf;
339                 Page            page;
340                 OffsetNumber offnum,
341                                         maxoff;
342                 bool            tupgone,
343                                         hastup;
344                 int                     prev_dead_count;
345                 OffsetNumber frozen[MaxOffsetNumber];
346                 int                     nfrozen;
347                 Size            freespace;
348                 bool            all_visible_according_to_vm = false;
349                 bool            all_visible;
350
351                 /*
352                  * Skip pages that don't require vacuuming according to the visibility
353                  * map. But only if we've seen a streak of at least
354                  * SKIP_PAGES_THRESHOLD pages marked as clean. Since we're reading
355                  * sequentially, the OS should be doing readahead for us and there's
356                  * no gain in skipping a page now and then. You need a longer run of
357                  * consecutive skipped pages before it's worthwhile. Also, skipping
358                  * even a single page means that we can't update relfrozenxid or
359                  * reltuples, so we only want to do it if there's a good chance to
360                  * skip a goodly number of pages.
361                  */
362                 if (!scan_all)
363                 {
364                         all_visible_according_to_vm =
365                                 visibilitymap_test(onerel, blkno, &vmbuffer);
366                         if (all_visible_according_to_vm)
367                         {
368                                 all_visible_streak++;
369                                 if (all_visible_streak >= SKIP_PAGES_THRESHOLD)
370                                 {
371                                         vacrelstats->scanned_all = false;
372                                         continue;
373                                 }
374                         }
375                         else
376                                 all_visible_streak = 0;
377                 }
378
379                 vacuum_delay_point();
380
381                 scanned_pages++;
382
383                 /*
384                  * If we are close to overrunning the available space for dead-tuple
385                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
386                  */
387                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
388                         vacrelstats->num_dead_tuples > 0)
389                 {
390                         /* Log cleanup info before we touch indexes */
391                         vacuum_log_cleanup_info(onerel, vacrelstats);
392
393                         /* Remove index entries */
394                         for (i = 0; i < nindexes; i++)
395                                 lazy_vacuum_index(Irel[i],
396                                                                   &indstats[i],
397                                                                   vacrelstats);
398                         /* Remove tuples from heap */
399                         lazy_vacuum_heap(onerel, vacrelstats);
400                         /*
401                          * Forget the now-vacuumed tuples, and press on, but be careful
402                          * not to reset latestRemovedXid since we want that value to be valid.
403                          */
404                         vacrelstats->num_dead_tuples = 0;
405                         vacrelstats->num_index_scans++;
406                 }
407
408                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
409                                                                  RBM_NORMAL, vac_strategy);
410
411                 /* We need buffer cleanup lock so that we can prune HOT chains. */
412                 LockBufferForCleanup(buf);
413
414                 page = BufferGetPage(buf);
415
416                 if (PageIsNew(page))
417                 {
418                         /*
419                          * An all-zeroes page could be left over if a backend extends the
420                          * relation but crashes before initializing the page. Reclaim such
421                          * pages for use.
422                          *
423                          * We have to be careful here because we could be looking at a
424                          * page that someone has just added to the relation and not yet
425                          * been able to initialize (see RelationGetBufferForTuple). To
426                          * protect against that, release the buffer lock, grab the
427                          * relation extension lock momentarily, and re-lock the buffer. If
428                          * the page is still uninitialized by then, it must be left over
429                          * from a crashed backend, and we can initialize it.
430                          *
431                          * We don't really need the relation lock when this is a new or
432                          * temp relation, but it's probably not worth the code space to
433                          * check that, since this surely isn't a critical path.
434                          *
435                          * Note: the comparable code in vacuum.c need not worry because
436                          * it's got exclusive lock on the whole relation.
437                          */
438                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
439                         LockRelationForExtension(onerel, ExclusiveLock);
440                         UnlockRelationForExtension(onerel, ExclusiveLock);
441                         LockBufferForCleanup(buf);
442                         if (PageIsNew(page))
443                         {
444                                 ereport(WARNING,
445                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
446                                                 relname, blkno)));
447                                 PageInit(page, BufferGetPageSize(buf), 0);
448                                 empty_pages++;
449                         }
450                         freespace = PageGetHeapFreeSpace(page);
451                         MarkBufferDirty(buf);
452                         UnlockReleaseBuffer(buf);
453
454                         RecordPageWithFreeSpace(onerel, blkno, freespace);
455                         continue;
456                 }
457
458                 if (PageIsEmpty(page))
459                 {
460                         empty_pages++;
461                         freespace = PageGetHeapFreeSpace(page);
462
463                         if (!PageIsAllVisible(page))
464                         {
465                                 PageSetAllVisible(page);
466                                 SetBufferCommitInfoNeedsSave(buf);
467                         }
468
469                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
470
471                         /* Update the visibility map */
472                         if (!all_visible_according_to_vm)
473                         {
474                                 visibilitymap_pin(onerel, blkno, &vmbuffer);
475                                 LockBuffer(buf, BUFFER_LOCK_SHARE);
476                                 if (PageIsAllVisible(page))
477                                         visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
478                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
479                         }
480
481                         ReleaseBuffer(buf);
482                         RecordPageWithFreeSpace(onerel, blkno, freespace);
483                         continue;
484                 }
485
486                 /*
487                  * Prune all HOT-update chains in this page.
488                  *
489                  * We count tuples removed by the pruning step as removed by VACUUM.
490                  */
491                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
492                                                                                                         &vacrelstats->latestRemovedXid);
493                 /*
494                  * Now scan the page to collect vacuumable items and check for tuples
495                  * requiring freezing.
496                  */
497                 all_visible = true;
498                 nfrozen = 0;
499                 hastup = false;
500                 prev_dead_count = vacrelstats->num_dead_tuples;
501                 maxoff = PageGetMaxOffsetNumber(page);
502                 for (offnum = FirstOffsetNumber;
503                          offnum <= maxoff;
504                          offnum = OffsetNumberNext(offnum))
505                 {
506                         ItemId          itemid;
507
508                         itemid = PageGetItemId(page, offnum);
509
510                         /* Unused items require no processing, but we count 'em */
511                         if (!ItemIdIsUsed(itemid))
512                         {
513                                 nunused += 1;
514                                 continue;
515                         }
516
517                         /* Redirect items mustn't be touched */
518                         if (ItemIdIsRedirected(itemid))
519                         {
520                                 hastup = true;  /* this page won't be truncatable */
521                                 continue;
522                         }
523
524                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
525
526                         /*
527                          * DEAD item pointers are to be vacuumed normally; but we don't
528                          * count them in tups_vacuumed, else we'd be double-counting (at
529                          * least in the common case where heap_page_prune() just freed up
530                          * a non-HOT tuple).
531                          */
532                         if (ItemIdIsDead(itemid))
533                         {
534                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
535                                 all_visible = false;
536                                 continue;
537                         }
538
539                         Assert(ItemIdIsNormal(itemid));
540
541                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
542                         tuple.t_len = ItemIdGetLength(itemid);
543
544                         tupgone = false;
545
546                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
547                         {
548                                 case HEAPTUPLE_DEAD:
549
550                                         /*
551                                          * Ordinarily, DEAD tuples would have been removed by
552                                          * heap_page_prune(), but it's possible that the tuple
553                                          * state changed since heap_page_prune() looked.  In
554                                          * particular an INSERT_IN_PROGRESS tuple could have
555                                          * changed to DEAD if the inserter aborted.  So this
556                                          * cannot be considered an error condition.
557                                          *
558                                          * If the tuple is HOT-updated then it must only be
559                                          * removed by a prune operation; so we keep it just as if
560                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
561                                          * tuple, we choose to keep it, because it'll be a lot
562                                          * cheaper to get rid of it in the next pruning pass than
563                                          * to treat it like an indexed tuple.
564                                          */
565                                         if (HeapTupleIsHotUpdated(&tuple) ||
566                                                 HeapTupleIsHeapOnly(&tuple))
567                                                 nkeep += 1;
568                                         else
569                                                 tupgone = true; /* we can delete the tuple */
570                                         all_visible = false;
571                                         break;
572                                 case HEAPTUPLE_LIVE:
573                                         /* Tuple is good --- but let's do some validity checks */
574                                         if (onerel->rd_rel->relhasoids &&
575                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
576                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
577                                                          relname, blkno, offnum);
578
579                                         /*
580                                          * Is the tuple definitely visible to all transactions?
581                                          *
582                                          * NB: Like with per-tuple hint bits, we can't set the
583                                          * PD_ALL_VISIBLE flag if the inserter committed
584                                          * asynchronously. See SetHintBits for more info. That is
585                                          * why we check that the HEAP_XMIN_COMMITTED hint bit is
586                                          * set here.
587                                          */
588                                         if (all_visible)
589                                         {
590                                                 TransactionId xmin;
591
592                                                 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
593                                                 {
594                                                         all_visible = false;
595                                                         break;
596                                                 }
597
598                                                 /*
599                                                  * The inserter definitely committed. But is it old
600                                                  * enough that everyone sees it as committed?
601                                                  */
602                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
603                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
604                                                 {
605                                                         all_visible = false;
606                                                         break;
607                                                 }
608                                         }
609                                         break;
610                                 case HEAPTUPLE_RECENTLY_DEAD:
611
612                                         /*
613                                          * If the tuple was recently deleted then we must not
614                                          * remove it from the relation.
615                                          */
616                                         nkeep += 1;
617                                         all_visible = false;
618                                         break;
619                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
620                                         /* This is an expected case during concurrent vacuum */
621                                         all_visible = false;
622                                         break;
623                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
624                                         /* This is an expected case during concurrent vacuum */
625                                         all_visible = false;
626                                         break;
627                                 default:
628                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
629                                         break;
630                         }
631
632                         if (tupgone)
633                         {
634                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
635                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
636                                                                                          &vacrelstats->latestRemovedXid);
637                                 tups_vacuumed += 1;
638                         }
639                         else
640                         {
641                                 num_tuples += 1;
642                                 hastup = true;
643
644                                 /*
645                                  * Each non-removable tuple must be checked to see if it needs
646                                  * freezing.  Note we already have exclusive buffer lock.
647                                  */
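                                    /*
                                     * We pass InvalidBuffer here; the buffer is marked dirty
                                     * and the freeze WAL record is emitted below, once, after
                                     * the item loop, covering all tuples frozen on this page.
                                     */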
648                                 if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
649                                                                           InvalidBuffer))
650                                         frozen[nfrozen++] = offnum;
651                         }
652                 }                                               /* scan along page */
653
654                 /*
655                  * If we froze any tuples, mark the buffer dirty, and write a WAL
656                  * record recording the changes.  We must log the changes to be
657                  * crash-safe against future truncation of CLOG.
658                  */
659                 if (nfrozen > 0)
660                 {
661                         MarkBufferDirty(buf);
662                         /* no XLOG for temp tables, though */
663                         if (!onerel->rd_istemp)
664                         {
665                                 XLogRecPtr      recptr;
666
667                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
668                                                                                  frozen, nfrozen);
669                                 PageSetLSN(page, recptr);
670                                 PageSetTLI(page, ThisTimeLineID);
671                         }
672                 }
673
674                 /*
675                  * If there are no indexes then we can vacuum the page right now
676                  * instead of doing a second scan.
677                  */
678                 if (nindexes == 0 &&
679                         vacrelstats->num_dead_tuples > 0)
680                 {
681                         /* Remove tuples from heap */
682                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
683                         /*
684                          * Forget the now-vacuumed tuples, and press on, but be careful
685                          * not to reset latestRemovedXid since we want that value to be valid.
686                          */
687                         Assert(TransactionIdIsValid(vacrelstats->latestRemovedXid));
688                         vacrelstats->num_dead_tuples = 0;
689                         vacuumed_pages++;
690                 }
691
692                 freespace = PageGetHeapFreeSpace(page);
693
694                 /* Update the all-visible flag on the page */
695                 if (!PageIsAllVisible(page) && all_visible)
696                 {
697                         PageSetAllVisible(page);
698                         SetBufferCommitInfoNeedsSave(buf);
699                 }
700                 else if (PageIsAllVisible(page) && !all_visible)
701                 {
702                         elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set in relation \"%s\" page %u",
703                                  relname, blkno);
704                         PageClearAllVisible(page);
705                         SetBufferCommitInfoNeedsSave(buf);
706
707                         /*
708                          * Normally, we would drop the lock on the heap page before
709                          * updating the visibility map, but since this case shouldn't
710                          * happen anyway, don't worry about that.
711                          */
712                         visibilitymap_clear(onerel, blkno);
713                 }
714
715                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
716
717                 /* Update the visibility map */
718                 if (!all_visible_according_to_vm && all_visible)
719                 {
720                         visibilitymap_pin(onerel, blkno, &vmbuffer);
721                         LockBuffer(buf, BUFFER_LOCK_SHARE);
722                         if (PageIsAllVisible(page))
723                                 visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer);
724                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
725                 }
726
727                 ReleaseBuffer(buf);
728
729                 /* Remember the location of the last page with nonremovable tuples */
730                 if (hastup)
731                         vacrelstats->nonempty_pages = blkno + 1;
732
733                 /*
734                  * If we remembered any tuples for deletion, then the page will be
735                  * visited again by lazy_vacuum_heap, which will compute and record
736                  * its post-compaction free space.      If not, then we're done with this
737                  * page, so remember its free space as-is.      (This path will always be
738                  * taken if there are no indexes.)
739                  */
740                 if (vacrelstats->num_dead_tuples == prev_dead_count)
741                         RecordPageWithFreeSpace(onerel, blkno, freespace);
742         }
743
744         /* save stats for use later */
745         vacrelstats->rel_tuples = num_tuples;
746         vacrelstats->tuples_deleted = tups_vacuumed;
747
748         /* If any tuples need to be deleted, perform final vacuum cycle */
749         /* XXX put a threshold on min number of tuples here? */
750         if (vacrelstats->num_dead_tuples > 0)
751         {
752                 /* Log cleanup info before we touch indexes */
753                 vacuum_log_cleanup_info(onerel, vacrelstats);
754
755                 /* Remove index entries */
756                 for (i = 0; i < nindexes; i++)
757                         lazy_vacuum_index(Irel[i],
758                                                           &indstats[i],
759                                                           vacrelstats);
760                 /* Remove tuples from heap */
761                 lazy_vacuum_heap(onerel, vacrelstats);
762                 vacrelstats->num_index_scans++;
763         }
764
765         /* Release the pin on the visibility map page */
766         if (BufferIsValid(vmbuffer))
767         {
768                 ReleaseBuffer(vmbuffer);
769                 vmbuffer = InvalidBuffer;
770         }
771
772         /* Do post-vacuum cleanup and statistics update for each index */
773         for (i = 0; i < nindexes; i++)
774                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
775
776         /* If no indexes, make log report that lazy_vacuum_heap would've made */
777         if (vacuumed_pages)
778                 ereport(elevel,
779                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
780                                                 RelationGetRelationName(onerel),
781                                                 tups_vacuumed, vacuumed_pages)));
782
783         ereport(elevel,
784                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
785                                         RelationGetRelationName(onerel),
786                                         tups_vacuumed, num_tuples, scanned_pages, nblocks),
787                          errdetail("%.0f dead row versions cannot be removed yet.\n"
788                                            "There were %.0f unused item pointers.\n"
789                                            "%u pages are entirely empty.\n"
790                                            "%s.",
791                                            nkeep,
792                                            nunused,
793                                            empty_pages,
794                                            pg_rusage_show(&ru0))));
795 }
796
797
798 /*
799  *      lazy_vacuum_heap() -- second pass over the heap
800  *
801  *              This routine marks dead tuples as unused and compacts out free
802  *              space on their pages.  Pages not having dead tuples recorded from
803  *              lazy_scan_heap are not visited at all.
804  *
805  * Note: the reason for doing this as a second pass is we cannot remove
806  * the tuples until we've removed their index entries, and we want to
807  * process index entry removal in batches as large as possible.
808  */
809 static void
810 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
811 {
812         int                     tupindex;
813         int                     npages;
814         PGRUsage        ru0;
815
816         pg_rusage_init(&ru0);
817         npages = 0;
818
819         tupindex = 0;
820         while (tupindex < vacrelstats->num_dead_tuples)
821         {
822                 BlockNumber tblk;
823                 Buffer          buf;
824                 Page            page;
825                 Size            freespace;
826
827                 vacuum_delay_point();
828
829                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
830                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
831                                                                  vac_strategy);
832                 LockBufferForCleanup(buf);
833                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
834
835                 /* Now that we've compacted the page, record its available space */
836                 page = BufferGetPage(buf);
837                 freespace = PageGetHeapFreeSpace(page);
838
839                 UnlockReleaseBuffer(buf);
840                 RecordPageWithFreeSpace(onerel, tblk, freespace);
841                 npages++;
842         }
843
844         ereport(elevel,
845                         (errmsg("\"%s\": removed %d row versions in %d pages",
846                                         RelationGetRelationName(onerel),
847                                         tupindex, npages),
848                          errdetail("%s.",
849                                            pg_rusage_show(&ru0))));
850 }
851
852 /*
853  *      lazy_vacuum_page() -- free dead tuples on a page
854  *                                       and repair its fragmentation.
855  *
856  * Caller must hold pin and buffer cleanup lock on the buffer.
857  *
858  * tupindex is the index in vacrelstats->dead_tuples of the first dead
859  * tuple for this page.  We assume the rest follow sequentially.
860  * The return value is the first tupindex after the tuples of this page.
861  */
862 static int
863 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
864                                  int tupindex, LVRelStats *vacrelstats)
865 {
866         Page            page = BufferGetPage(buffer);
867         OffsetNumber unused[MaxOffsetNumber];
868         int                     uncnt = 0;
869
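            /*
             * The item changes and the WAL record describing them must be
             * applied together, so both happen inside a critical section:
             * any error in between becomes a PANIC rather than leaving the
             * page and the log out of sync.
             */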
870         START_CRIT_SECTION();
871
872         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
873         {
874                 BlockNumber tblk;
875                 OffsetNumber toff;
876                 ItemId          itemid;
877
878                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
879                 if (tblk != blkno)
880                         break;                          /* past end of tuples for this block */
881                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
882                 itemid = PageGetItemId(page, toff);
883                 ItemIdSetUnused(itemid);
884                 unused[uncnt++] = toff;
885         }
886
887         PageRepairFragmentation(page);
888
889         MarkBufferDirty(buffer);
890
891         /* XLOG stuff */
892         if (!onerel->rd_istemp)
893         {
894                 XLogRecPtr      recptr;
895
896                 recptr = log_heap_clean(onerel, buffer,
897                                                                 NULL, 0, NULL, 0,
898                                                                 unused, uncnt,
899                                                                 vacrelstats->latestRemovedXid);
900                 PageSetLSN(page, recptr);
901                 PageSetTLI(page, ThisTimeLineID);
902         }
903
904         END_CRIT_SECTION();
905
906         return tupindex;
907 }
908
909 /*
910  *      lazy_vacuum_index() -- vacuum one index relation.
911  *
912  *              Delete all the index entries pointing to tuples listed in
913  *              vacrelstats->dead_tuples, and update running statistics.
914  */
915 static void
916 lazy_vacuum_index(Relation indrel,
917                                   IndexBulkDeleteResult **stats,
918                                   LVRelStats *vacrelstats)
919 {
920         IndexVacuumInfo ivinfo;
921         PGRUsage        ru0;
922
923         pg_rusage_init(&ru0);
924
925         ivinfo.index = indrel;
926         ivinfo.analyze_only = false;
927         ivinfo.estimated_count = true;
928         ivinfo.message_level = elevel;
929         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
930         ivinfo.strategy = vac_strategy;
931
932         /* Do bulk deletion */
933         *stats = index_bulk_delete(&ivinfo, *stats,
934                                                            lazy_tid_reaped, (void *) vacrelstats);
935
936         ereport(elevel,
937                         (errmsg("scanned index \"%s\" to remove %d row versions",
938                                         RelationGetRelationName(indrel),
939                                         vacrelstats->num_dead_tuples),
940                          errdetail("%s.", pg_rusage_show(&ru0))));
941 }
942
943 /*
944  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
945  */
946 static void
947 lazy_cleanup_index(Relation indrel,
948                                    IndexBulkDeleteResult *stats,
949                                    LVRelStats *vacrelstats)
950 {
951         IndexVacuumInfo ivinfo;
952         PGRUsage        ru0;
953
954         pg_rusage_init(&ru0);
955
956         ivinfo.index = indrel;
957         ivinfo.analyze_only = false;
958         ivinfo.estimated_count = !vacrelstats->scanned_all;
959         ivinfo.message_level = elevel;
960         /* use rel_tuples only if we scanned all pages, else the old reltuples */
961         ivinfo.num_heap_tuples = vacrelstats->scanned_all ? vacrelstats->rel_tuples : vacrelstats->old_rel_tuples;
962         ivinfo.strategy = vac_strategy;
963
964         stats = index_vacuum_cleanup(&ivinfo, stats);
965
966         if (!stats)
967                 return;
968
969         /*
970          * Now update statistics in pg_class, but only if the index says the count
971          * is accurate.
972          */
973         if (!stats->estimated_count)
974                 vac_update_relstats(indrel,
975                                                         stats->num_pages, stats->num_index_tuples,
976                                                         false, InvalidTransactionId);
977
978         ereport(elevel,
979                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
980                                         RelationGetRelationName(indrel),
981                                         stats->num_index_tuples,
982                                         stats->num_pages),
983                          errdetail("%.0f index row versions were removed.\n"
984                          "%u index pages have been deleted, %u are currently reusable.\n"
985                                            "%s.",
986                                            stats->tuples_removed,
987                                            stats->pages_deleted, stats->pages_free,
988                                            pg_rusage_show(&ru0))));
989
990         pfree(stats);
991 }
992
993 /*
994  * lazy_truncate_heap - try to truncate off any empty pages at the end
995  */
996 static void
997 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
998 {
999         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1000         BlockNumber new_rel_pages;
1001         PGRUsage        ru0;
1002
1003         pg_rusage_init(&ru0);
1004
1005         /*
1006          * We need full exclusive lock on the relation in order to do truncation.
1007          * If we can't get it, give up rather than waiting --- we don't want to
1008          * block other backends, and we don't want to deadlock (which is quite
1009          * possible considering we already hold a lower-grade lock).
1010          */
1011         if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
1012                 return;
1013
1014         /*
1015          * Now that we have exclusive lock, look to see if the rel has grown
1016          * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
1017          * newly added pages presumably contain non-deletable tuples.
1018          */
1019         new_rel_pages = RelationGetNumberOfBlocks(onerel);
1020         if (new_rel_pages != old_rel_pages)
1021         {
1022                 /* might as well use the latest news when we update pg_class stats */
1023                 vacrelstats->rel_pages = new_rel_pages;
1024                 UnlockRelation(onerel, AccessExclusiveLock);
1025                 return;
1026         }
1027
1028         /*
1029          * Scan backwards from the end to verify that the end pages actually
1030          * contain no tuples.  This is *necessary*, not optional, because other
1031          * backends could have added tuples to these pages whilst we were
1032          * vacuuming.
1033          */
1034         new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1035
1036         if (new_rel_pages >= old_rel_pages)
1037         {
1038                 /* can't do anything after all */
1039                 UnlockRelation(onerel, AccessExclusiveLock);
1040                 return;
1041         }
1042
1043         /*
1044          * Okay to truncate.
1045          */
1046         RelationTruncate(onerel, new_rel_pages);
1047
1048         /*
1049          * We can release the exclusive lock as soon as we have truncated.      Other
1050          * backends can't safely access the relation until they have processed the
1051          * smgr invalidation that smgrtruncate sent out ... but that should happen
1052          * as part of standard invalidation processing once they acquire lock on
1053          * the relation.
1054          */
1055         UnlockRelation(onerel, AccessExclusiveLock);
1056
1057         /* update statistics */
1058         vacrelstats->rel_pages = new_rel_pages;
1059         vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
1060
1061         ereport(elevel,
1062                         (errmsg("\"%s\": truncated %u to %u pages",
1063                                         RelationGetRelationName(onerel),
1064                                         old_rel_pages, new_rel_pages),
1065                          errdetail("%s.",
1066                                            pg_rusage_show(&ru0))));
1067 }
1068
1069 /*
1070  * Rescan end pages to verify that they are (still) empty of tuples.
1071  *
1072  * Returns number of nondeletable pages (last nonempty page + 1).
1073  */
1074 static BlockNumber
1075 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1076 {
1077         BlockNumber blkno;
1078
1079         /* Strange coding of loop control is needed because blkno is unsigned */
1080         blkno = vacrelstats->rel_pages;
1081         while (blkno > vacrelstats->nonempty_pages)
1082         {
1083                 Buffer          buf;
1084                 Page            page;
1085                 OffsetNumber offnum,
1086                                         maxoff;
1087                 bool            hastup;
1088
1089                 /*
1090                  * We don't insert a vacuum delay point here, because we have an
1091                  * exclusive lock on the table which we want to hold for as short a
1092                  * time as possible.  We still need to check for interrupts however.
1093                  */
1094                 CHECK_FOR_INTERRUPTS();
1095
1096                 blkno--;
1097
1098                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1099                                                                  RBM_NORMAL, vac_strategy);
1100
1101                 /* In this phase we only need shared access to the buffer */
1102                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1103
1104                 page = BufferGetPage(buf);
1105
1106                 if (PageIsNew(page) || PageIsEmpty(page))
1107                 {
1108                         /* PageIsNew probably shouldn't happen... */
1109                         UnlockReleaseBuffer(buf);
1110                         continue;
1111                 }
1112
1113                 hastup = false;
1114                 maxoff = PageGetMaxOffsetNumber(page);
1115                 for (offnum = FirstOffsetNumber;
1116                          offnum <= maxoff;
1117                          offnum = OffsetNumberNext(offnum))
1118                 {
1119                         ItemId          itemid;
1120
1121                         itemid = PageGetItemId(page, offnum);
1122
1123                         /*
1124                          * Note: any non-unused item should be taken as a reason to keep
1125                          * this page.  We formerly thought that DEAD tuples could be
1126                          * thrown away, but that's not so, because we'd not have cleaned
1127                          * out their index entries.
1128                          */
1129                         if (ItemIdIsUsed(itemid))
1130                         {
1131                                 hastup = true;
1132                                 break;                  /* can stop scanning */
1133                         }
1134                 }                                               /* scan along page */
1135
1136                 UnlockReleaseBuffer(buf);
1137
1138                 /* Done scanning if we found a tuple here */
1139                 if (hastup)
1140                         return blkno + 1;
1141         }
1142
1143         /*
1144          * If we fall out of the loop, all the previously-thought-to-be-empty
1145          * pages still are; we need not bother to look at the last known-nonempty
1146          * page.
1147          */
1148         return vacrelstats->nonempty_pages;
1149 }
1150
1151 /*
1152  * lazy_space_alloc - space allocation decisions for lazy vacuum
1153  *
1154  * See the comments at the head of this file for rationale.
1155  */
1156 static void
1157 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1158 {
1159         long            maxtuples;
1160
1161         if (vacrelstats->hasindex)
1162         {
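                     /*
                      * maintenance_work_mem is measured in kB, so this converts it
                      * into a TID count.  As a rough worked example (assuming the
                      * usual 6-byte ItemPointerData), a 64MB setting allows about
                      * 11 million dead-tuple TIDs before an index-vacuum pass is
                      * forced.
                      */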
1163                 maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
1164                 maxtuples = Min(maxtuples, INT_MAX);
1165                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1166
1167                 /* curious coding here to ensure the multiplication can't overflow */
1168                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1169                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1170
1171                 /* stay sane if small maintenance_work_mem */
1172                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1173         }
1174         else
1175         {
1176                 maxtuples = MaxHeapTuplesPerPage;
1177         }
1178
1179         vacrelstats->num_dead_tuples = 0;
1180         vacrelstats->max_dead_tuples = (int) maxtuples;
1181         vacrelstats->dead_tuples = (ItemPointer)
1182                 palloc(maxtuples * sizeof(ItemPointerData));
1183 }
1184
1185 /*
1186  * lazy_record_dead_tuple - remember one deletable tuple
1187  */
1188 static void
1189 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1190                                            ItemPointer itemptr)
1191 {
1192         /*
1193          * The array shouldn't overflow under normal behavior, but perhaps it
1194          * could if we are given a really small maintenance_work_mem. In that
1195          * case, just forget the last few tuples (we'll get 'em next time).
1196          */
1197         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1198         {
1199                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1200                 vacrelstats->num_dead_tuples++;
1201         }
1202 }
1203
1204 /*
1205  *      lazy_tid_reaped() -- is a particular tid deletable?
1206  *
1207  *              This has the right signature to be an IndexBulkDeleteCallback.
1208  *
1209  *              Assumes dead_tuples array is in sorted order.
1210  */
1211 static bool
1212 lazy_tid_reaped(ItemPointer itemptr, void *state)
1213 {
1214         LVRelStats *vacrelstats = (LVRelStats *) state;
1215         ItemPointer res;
1216
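             /*
              * lazy_scan_heap records dead TIDs in physical (block, offset)
              * order during its forward heap scan, so the array is already
              * sorted and bsearch() is safe here.
              */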
1217         res = (ItemPointer) bsearch((void *) itemptr,
1218                                                                 (void *) vacrelstats->dead_tuples,
1219                                                                 vacrelstats->num_dead_tuples,
1220                                                                 sizeof(ItemPointerData),
1221                                                                 vac_cmp_itemptr);
1222
1223         return (res != NULL);
1224 }
1225
1226 /*
1227  * Comparator routine for use with bsearch() on the dead_tuples array.
1228  */
1229 static int
1230 vac_cmp_itemptr(const void *left, const void *right)
1231 {
1232         BlockNumber lblk,
1233                                 rblk;
1234         OffsetNumber loff,
1235                                 roff;
1236
1237         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1238         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1239
1240         if (lblk < rblk)
1241                 return -1;
1242         if (lblk > rblk)
1243                 return 1;
1244
1245         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1246         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1247
1248         if (loff < roff)
1249                 return -1;
1250         if (loff > roff)
1251                 return 1;
1252
1253         return 0;
1254 }