1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem memory space to keep
14  * track of dead tuples.  We initially allocate an array of TIDs of that size.
15  * If the array threatens to overflow, we suspend the heap scan phase and
16  * perform a pass of index cleanup and page compaction, then resume the heap
17  * scan with an empty TID array.
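 * For example, with maintenance_work_mem set to 16MB and a 6-byte
 * ItemPointerData, that initial array can hold roughly 2.8 million
 * dead-tuple TIDs before a cleanup pass is forced.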
18  *
19  * We can limit the storage for page free space to MaxFSMPages entries,
20  * since that's the most the free space map will be willing to remember
21  * anyway.      If the relation has fewer than that many pages with free space,
22  * life is easy: just build an array of per-page info.  If it has more,
23  * we store the free space info as a heap ordered by amount of free space,
24  * so that we can discard the pages with least free space to ensure we never
25  * have more than MaxFSMPages entries in all.  The surviving page entries
26  * are passed to the free space map at conclusion of the scan.
27  *
28  *
29  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
30  * Portions Copyright (c) 1994, Regents of the University of California
31  *
32  *
33  * IDENTIFICATION
34  *        $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.66 2006/02/11 23:31:34 tgl Exp $
35  *
36  *-------------------------------------------------------------------------
37  */
38 #include "postgres.h"
39
40 #include <math.h>
41
42 #include "access/genam.h"
43 #include "access/heapam.h"
44 #include "access/xlog.h"
45 #include "commands/vacuum.h"
46 #include "miscadmin.h"
47 #include "pgstat.h"
48 #include "storage/freespace.h"
49 #include "storage/smgr.h"
50 #include "utils/lsyscache.h"
51 #include "utils/pg_rusage.h"
52
53
54 /*
55  * Space/time tradeoff parameters: do these need to be user-tunable?
56  *
57  * To consider truncating the relation, we want there to be at least
58  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
59  * is less) potentially-freeable pages.
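 * For example, a 16000-page relation (about 125MB with 8K pages) needs at
 * least 1000 potentially-freeable pages before truncation is considered,
 * while a 1600-page relation needs only 100.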
60  */
61 #define REL_TRUNCATE_MINIMUM    1000
62 #define REL_TRUNCATE_FRACTION   16
63
64
65 typedef struct LVRelStats
66 {
67         /* Overall statistics about rel */
68         BlockNumber rel_pages;
69         double          rel_tuples;
70         BlockNumber pages_removed;
71         double          tuples_deleted;
72         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
73         Size            threshold;              /* minimum interesting free space */
74         /* List of TIDs of tuples we intend to delete */
75         /* NB: this list is ordered by TID address */
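        /* (The heap scan records them in block/offset order, which is what
         * lets lazy_tid_reaped() locate entries with bsearch.) */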
76         int                     num_dead_tuples;        /* current # of entries */
77         int                     max_dead_tuples;        /* # slots allocated in array */
78         ItemPointer dead_tuples;        /* array of ItemPointerData */
79         /* Array or heap of per-page info about free space */
80         /* We use a simple array until it fills up, then convert to heap */
81         bool            fs_is_heap;             /* are we using heap organization? */
82         int                     num_free_pages; /* current # of entries */
83         int                     max_free_pages; /* # slots allocated in array */
84         PageFreeSpaceInfo *free_pages;          /* array or heap of blkno/avail */
85 } LVRelStats;
86
87
88 static int      elevel = -1;
89
90 static TransactionId OldestXmin;
91 static TransactionId FreezeLimit;
92
93
94 /* non-export function prototypes */
95 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
96                            Relation *Irel, int nindexes);
97 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
98 static void lazy_scan_index(Relation indrel, LVRelStats *vacrelstats);
99 static void lazy_vacuum_index(Relation indrel,
100                                   double *index_tups_vacuumed,
101                                   BlockNumber *index_pages_removed,
102                                   LVRelStats *vacrelstats);
103 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
104                                  int tupindex, LVRelStats *vacrelstats);
105 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
106 static BlockNumber count_nondeletable_pages(Relation onerel,
107                                                  LVRelStats *vacrelstats);
108 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
109 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
110                                            ItemPointer itemptr);
111 static void lazy_record_free_space(LVRelStats *vacrelstats,
112                                            BlockNumber page, Size avail);
113 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
114 static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
115 static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
116 static int      vac_cmp_itemptr(const void *left, const void *right);
117 static int      vac_cmp_page_spaces(const void *left, const void *right);
118
119
120 /*
121  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
122  *
123  *              This routine vacuums a single heap, cleans out its indexes, and
124  *              updates its num_pages and num_tuples statistics.
125  *
126  *              At entry, we have already established a transaction and opened
127  *              and locked the relation.
128  */
129 void
130 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
131 {
132         LVRelStats *vacrelstats;
133         Relation   *Irel;
134         int                     nindexes;
135         bool            hasindex;
136         BlockNumber possibly_freeable;
137
138         if (vacstmt->verbose)
139                 elevel = INFO;
140         else
141                 elevel = DEBUG2;
142
143         vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
144                                                   &OldestXmin, &FreezeLimit);
145
146         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
147
148         /* Set threshold for interesting free space = average request size */
149         /* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
150         vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);
151
152         /* Open all indexes of the relation */
153         vac_open_indexes(onerel, ShareUpdateExclusiveLock, &nindexes, &Irel);
154         hasindex = (nindexes > 0);
155
156         /* Do the vacuuming */
157         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);
158
159         /* Done with indexes */
160         vac_close_indexes(nindexes, Irel, NoLock);
161
162         /*
163          * Optionally truncate the relation.
164          *
165          * Don't even think about it unless we have a shot at releasing a goodly
166          * number of pages.  Otherwise, the time taken isn't worth it.
167          */
168         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
169         if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
170                 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
171                 lazy_truncate_heap(onerel, vacrelstats);
172
173         /* Update shared free space map with final free space info */
174         lazy_update_fsm(onerel, vacrelstats);
175
176         /* Update statistics in pg_class */
177         vac_update_relstats(RelationGetRelid(onerel),
178                                                 vacrelstats->rel_pages,
179                                                 vacrelstats->rel_tuples,
180                                                 hasindex);
181
182         /* report results to the stats collector, too */
183         pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
184                                                  vacstmt->analyze, vacrelstats->rel_tuples);
185 }
186
187
188 /*
189  *      lazy_scan_heap() -- scan an open heap relation
190  *
191  *              This routine sets commit status bits, builds lists of dead tuples
192  *              and pages with free space, and calculates statistics on the number
193  *              of live tuples in the heap.  When done, or when we run low on space
194  *              for dead-tuple TIDs, invoke vacuuming of indexes and heap.
195  */
196 static void
197 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
198                            Relation *Irel, int nindexes)
199 {
200         BlockNumber nblocks,
201                                 blkno;
202         HeapTupleData tuple;
203         char       *relname;
204         BlockNumber empty_pages;
205         double          num_tuples,
206                                 tups_vacuumed,
207                                 nkeep,
208                                 nunused;
209         double     *index_tups_vacuumed;
210         BlockNumber *index_pages_removed;
211         bool            did_vacuum_index = false;
212         int                     i;
213         PGRUsage        ru0;
214
215         pg_rusage_init(&ru0);
216
217         relname = RelationGetRelationName(onerel);
218         ereport(elevel,
219                         (errmsg("vacuuming \"%s.%s\"",
220                                         get_namespace_name(RelationGetNamespace(onerel)),
221                                         relname)));
222
223         empty_pages = 0;
224         num_tuples = tups_vacuumed = nkeep = nunused = 0;
225
226         /*
227          * Because index vacuuming is done in multiple passes, we have to keep
228          * track of the total number of rows and pages removed from each index.
229          * index_tups_vacuumed[i] is the number removed so far from the i'th
230          * index.  (For partial indexes this could well be different from
231          * tups_vacuumed.)      Likewise for index_pages_removed[i].
232          */
233         index_tups_vacuumed = (double *) palloc0(nindexes * sizeof(double));
234         index_pages_removed = (BlockNumber *) palloc0(nindexes * sizeof(BlockNumber));
235
236         nblocks = RelationGetNumberOfBlocks(onerel);
237         vacrelstats->rel_pages = nblocks;
238         vacrelstats->nonempty_pages = 0;
239
240         lazy_space_alloc(vacrelstats, nblocks);
241
242         for (blkno = 0; blkno < nblocks; blkno++)
243         {
244                 Buffer          buf;
245                 Page            page;
246                 OffsetNumber offnum,
247                                         maxoff;
248                 bool            pgchanged,
249                                         tupgone,
250                                         hastup;
251                 int                     prev_dead_count;
252
253                 vacuum_delay_point();
254
255                 /*
256                  * If we are close to overrunning the available space for dead-tuple
257                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
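                 * We test against MaxHeapTuplesPerPage because this page could add up
                 * to that many dead TIDs, and we never stop partway through a page.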
258                  */
259                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
260                         vacrelstats->num_dead_tuples > 0)
261                 {
262                         /* Remove index entries */
263                         for (i = 0; i < nindexes; i++)
264                                 lazy_vacuum_index(Irel[i],
265                                                                   &index_tups_vacuumed[i],
266                                                                   &index_pages_removed[i],
267                                                                   vacrelstats);
268                         did_vacuum_index = true;
269                         /* Remove tuples from heap */
270                         lazy_vacuum_heap(onerel, vacrelstats);
271                         /* Forget the now-vacuumed tuples, and press on */
272                         vacrelstats->num_dead_tuples = 0;
273                 }
274
275                 buf = ReadBuffer(onerel, blkno);
276
277                 /* In this phase we only need shared access to the buffer */
278                 LockBuffer(buf, BUFFER_LOCK_SHARE);
279
280                 page = BufferGetPage(buf);
281
282                 if (PageIsNew(page))
283                 {
284                         /*
285                          * An all-zeroes page could be left over if a backend extends the
286                          * relation but crashes before initializing the page. Reclaim such
287                          * pages for use.
288                          *
289                          * We have to be careful here because we could be looking at a
290                          * page that someone has just added to the relation and not yet
291                          * been able to initialize (see RelationGetBufferForTuple). To
292                          * interlock against that, release the buffer read lock (which we
293                          * must do anyway) and grab the relation extension lock before
294                          * re-locking in exclusive mode.  If the page is still
295                          * uninitialized by then, it must be left over from a crashed
296                          * backend, and we can initialize it.
297                          *
298                          * We don't really need the relation lock when this is a new or
299                          * temp relation, but it's probably not worth the code space to
300                          * check that, since this surely isn't a critical path.
301                          *
302                          * Note: the comparable code in vacuum.c need not worry because
303                          * it's got exclusive lock on the whole relation.
304                          */
305                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
306                         LockRelationForExtension(onerel, ExclusiveLock);
307                         UnlockRelationForExtension(onerel, ExclusiveLock);
308                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
309                         if (PageIsNew(page))
310                         {
311                                 ereport(WARNING,
312                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
313                                                 relname, blkno)));
314                                 PageInit(page, BufferGetPageSize(buf), 0);
315                                 empty_pages++;
316                                 lazy_record_free_space(vacrelstats, blkno,
317                                                                            PageGetFreeSpace(page));
318                         }
319                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
320                         WriteBuffer(buf);
321                         continue;
322                 }
323
324                 if (PageIsEmpty(page))
325                 {
326                         empty_pages++;
327                         lazy_record_free_space(vacrelstats, blkno,
328                                                                    PageGetFreeSpace(page));
329                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
330                         ReleaseBuffer(buf);
331                         continue;
332                 }
333
334                 pgchanged = false;
335                 hastup = false;
336                 prev_dead_count = vacrelstats->num_dead_tuples;
337                 maxoff = PageGetMaxOffsetNumber(page);
338                 for (offnum = FirstOffsetNumber;
339                          offnum <= maxoff;
340                          offnum = OffsetNumberNext(offnum))
341                 {
342                         ItemId          itemid;
343
344                         itemid = PageGetItemId(page, offnum);
345
346                         if (!ItemIdIsUsed(itemid))
347                         {
348                                 nunused += 1;
349                                 continue;
350                         }
351
352                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
353                         tuple.t_len = ItemIdGetLength(itemid);
354                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
355
356                         tupgone = false;
357
358                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
359                         {
360                                 case HEAPTUPLE_DEAD:
361                                         tupgone = true;         /* we can delete the tuple */
362                                         break;
363                                 case HEAPTUPLE_LIVE:
364
365                                         /*
366                                          * Tuple is good.  Consider whether to replace its xmin
367                                          * value with FrozenTransactionId.
368                                          *
369                                          * NB: Since we hold only a shared buffer lock here, we
370                                          * are assuming that TransactionId read/write is atomic.
371                                          * This is not the only place that makes such an
372                                          * assumption. It'd be possible to avoid the assumption by
373                                          * momentarily acquiring exclusive lock, but for the
374                                          * moment I see no need to.
375                                          */
376                                         if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
377                                                 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
378                                                                                           FreezeLimit))
379                                         {
380                                                 HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
381                                                 /* infomask should be okay already */
382                                                 Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
383                                                 pgchanged = true;
384                                         }
385
386                                         /*
387                                          * Other checks...
388                                          */
389                                         if (onerel->rd_rel->relhasoids &&
390                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
391                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
392                                                          relname, blkno, offnum);
393                                         break;
394                                 case HEAPTUPLE_RECENTLY_DEAD:
395
396                                         /*
397                                          * If tuple is recently deleted then we must not remove it
398                                          * from relation.
399                                          */
400                                         nkeep += 1;
401                                         break;
402                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
403                                         /* This is an expected case during concurrent vacuum */
404                                         break;
405                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
406                                         /* This is an expected case during concurrent vacuum */
407                                         break;
408                                 default:
409                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
410                                         break;
411                         }
412
413                         if (tupgone)
414                         {
415                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
416                                 tups_vacuumed += 1;
417                         }
418                         else
419                         {
420                                 num_tuples += 1;
421                                 hastup = true;
422                         }
423                 }                                               /* scan along page */
424
425                 /*
426                  * If we remembered any tuples for deletion, then the page will be
427                  * visited again by lazy_vacuum_heap, which will compute and record
428                  * its post-compaction free space.      If not, then we're done with this
429                  * page, so remember its free space as-is.
430                  */
431                 if (vacrelstats->num_dead_tuples == prev_dead_count)
432                 {
433                         lazy_record_free_space(vacrelstats, blkno,
434                                                                    PageGetFreeSpace(page));
435                 }
436
437                 /* Remember the location of the last page with nonremovable tuples */
438                 if (hastup)
439                         vacrelstats->nonempty_pages = blkno + 1;
440
441                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
442
443                 if (pgchanged)
444                         WriteBuffer(buf);
445                 else
446                         ReleaseBuffer(buf);
447         }
448
449         /* save stats for use later */
450         vacrelstats->rel_tuples = num_tuples;
451         vacrelstats->tuples_deleted = tups_vacuumed;
452
453         /* If any tuples need to be deleted, perform final vacuum cycle */
454         /* XXX put a threshold on min number of tuples here? */
455         if (vacrelstats->num_dead_tuples > 0)
456         {
457                 /* Remove index entries */
458                 for (i = 0; i < nindexes; i++)
459                         lazy_vacuum_index(Irel[i],
460                                                           &index_tups_vacuumed[i],
461                                                           &index_pages_removed[i],
462                                                           vacrelstats);
463                 /* Remove tuples from heap */
464                 lazy_vacuum_heap(onerel, vacrelstats);
465         }
466         else if (!did_vacuum_index)
467         {
468                 /* Must do post-vacuum cleanup and statistics update anyway */
469                 for (i = 0; i < nindexes; i++)
470                         lazy_scan_index(Irel[i], vacrelstats);
471         }
472
473         ereport(elevel,
474                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
475                                         RelationGetRelationName(onerel),
476                                         tups_vacuumed, num_tuples, nblocks),
477                          errdetail("%.0f dead row versions cannot be removed yet.\n"
478                                            "There were %.0f unused item pointers.\n"
479                                            "%u pages are entirely empty.\n"
480                                            "%s.",
481                                            nkeep,
482                                            nunused,
483                                            empty_pages,
484                                            pg_rusage_show(&ru0))));
485 }
486
487
488 /*
489  *      lazy_vacuum_heap() -- second pass over the heap
490  *
491  *              This routine marks dead tuples as unused and compacts out free
492  *              space on their pages.  Pages not having dead tuples recorded from
493  *              lazy_scan_heap are not visited at all.
494  *
495  * Note: the reason for doing this as a second pass is we cannot remove
496  * the tuples until we've removed their index entries, and we want to
497  * process index entry removal in batches as large as possible.
498  */
499 static void
500 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
501 {
502         int                     tupindex;
503         int                     npages;
504         PGRUsage        ru0;
505
506         pg_rusage_init(&ru0);
507         npages = 0;
508
509         tupindex = 0;
510         while (tupindex < vacrelstats->num_dead_tuples)
511         {
512                 BlockNumber tblk;
513                 Buffer          buf;
514                 Page            page;
515
516                 vacuum_delay_point();
517
518                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
519                 buf = ReadBuffer(onerel, tblk);
520                 LockBufferForCleanup(buf);
521                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
522                 /* Now that we've compacted the page, record its available space */
523                 page = BufferGetPage(buf);
524                 lazy_record_free_space(vacrelstats, tblk,
525                                                            PageGetFreeSpace(page));
526                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
527                 WriteBuffer(buf);
528                 npages++;
529         }
530
531         ereport(elevel,
532                         (errmsg("\"%s\": removed %d row versions in %d pages",
533                                         RelationGetRelationName(onerel),
534                                         tupindex, npages),
535                          errdetail("%s.",
536                                            pg_rusage_show(&ru0))));
537 }
538
539 /*
540  *      lazy_vacuum_page() -- free dead tuples on a page
541  *                                       and repair its fragmentation.
542  *
543  * Caller is expected to handle reading, locking, and writing the buffer.
544  *
545  * tupindex is the index in vacrelstats->dead_tuples of the first dead
546  * tuple for this page.  We assume the rest follow sequentially.
547  * The return value is the first tupindex after the tuples of this page.
548  */
549 static int
550 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
551                                  int tupindex, LVRelStats *vacrelstats)
552 {
553         OffsetNumber unused[MaxOffsetNumber];
554         int                     uncnt;
555         Page            page = BufferGetPage(buffer);
556         ItemId          itemid;
557
558         START_CRIT_SECTION();
559         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
560         {
561                 BlockNumber tblk;
562                 OffsetNumber toff;
563
564                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
565                 if (tblk != blkno)
566                         break;                          /* past end of tuples for this block */
567                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
568                 itemid = PageGetItemId(page, toff);
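                /* clearing LP_USED marks the item pointer unused; the
                 * PageRepairFragmentation() call below compacts the page and
                 * collects the freed offsets for the WAL record */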
569                 itemid->lp_flags &= ~LP_USED;
570         }
571
572         uncnt = PageRepairFragmentation(page, unused);
573
574         /* XLOG stuff */
575         if (!onerel->rd_istemp)
576         {
577                 XLogRecPtr      recptr;
578
579                 recptr = log_heap_clean(onerel, buffer, unused, uncnt);
580                 PageSetLSN(page, recptr);
581                 PageSetTLI(page, ThisTimeLineID);
582         }
583         else
584         {
585                 /* No XLOG record, but still need to flag that XID exists on disk */
586                 MyXactMadeTempRelUpdate = true;
587         }
588
589         END_CRIT_SECTION();
590
591         return tupindex;
592 }
593
594 /*
595  *      lazy_scan_index() -- scan one index relation to update pg_class statistic.
596  *
597  * We use this when we have no deletions to do.
598  */
599 static void
600 lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
601 {
602         IndexBulkDeleteResult *stats;
603         IndexVacuumCleanupInfo vcinfo;
604         PGRUsage        ru0;
605
606         pg_rusage_init(&ru0);
607
608         /*
609          * Acquire appropriate type of lock on index: must be exclusive if index
610          * AM isn't concurrent-safe.
611          */
612         if (indrel->rd_am->amconcurrent)
613                 LockRelation(indrel, RowExclusiveLock);
614         else
615                 LockRelation(indrel, AccessExclusiveLock);
616
617         /*
618          * Even though we're not planning to delete anything, we use the
619          * ambulkdelete call, because (a) the scan happens within the index AM for
620          * more speed, and (b) it may want to pass private statistics to the
621          * amvacuumcleanup call.
622          */
623         stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
624
625         /* Do post-VACUUM cleanup, even though we deleted nothing */
626         vcinfo.vacuum_full = false;
627         vcinfo.message_level = elevel;
628         vcinfo.num_heap_tuples = vacrelstats->rel_tuples;
629
630         stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
631
632         /*
633          * Release lock acquired above.
634          */
635         if (indrel->rd_am->amconcurrent)
636                 UnlockRelation(indrel, RowExclusiveLock);
637         else
638                 UnlockRelation(indrel, AccessExclusiveLock);
639
640         if (!stats)
641                 return;
642
643         /* now update statistics in pg_class */
644         vac_update_relstats(RelationGetRelid(indrel),
645                                                 stats->num_pages,
646                                                 stats->num_index_tuples,
647                                                 false);
648
649         ereport(elevel,
650                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
651                                         RelationGetRelationName(indrel),
652                                         stats->num_index_tuples,
653                                         stats->num_pages),
654         errdetail("%u index pages have been deleted, %u are currently reusable.\n"
655                           "%s.",
656                           stats->pages_deleted, stats->pages_free,
657                           pg_rusage_show(&ru0))));
658
659         pfree(stats);
660 }
661
662 /*
663  *      lazy_vacuum_index() -- vacuum one index relation.
664  *
665  *              Delete all the index entries pointing to tuples listed in
666  *              vacrelstats->dead_tuples.
667  *
668  *              Increment *index_tups_vacuumed by the number of index entries
669  *              removed, and *index_pages_removed by the number of pages removed.
670  *
671  *              Finally, we arrange to update the index relation's statistics in
672  *              pg_class.
673  */
674 static void
675 lazy_vacuum_index(Relation indrel,
676                                   double *index_tups_vacuumed,
677                                   BlockNumber *index_pages_removed,
678                                   LVRelStats *vacrelstats)
679 {
680         IndexBulkDeleteResult *stats;
681         IndexVacuumCleanupInfo vcinfo;
682         PGRUsage        ru0;
683
684         pg_rusage_init(&ru0);
685
686         /*
687          * Acquire appropriate type of lock on index: must be exclusive if index
688          * AM isn't concurrent-safe.
689          */
690         if (indrel->rd_am->amconcurrent)
691                 LockRelation(indrel, RowExclusiveLock);
692         else
693                 LockRelation(indrel, AccessExclusiveLock);
694
695         /* Do bulk deletion */
696         stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);
697
698         /* Do post-VACUUM cleanup */
699         vcinfo.vacuum_full = false;
700         vcinfo.message_level = elevel;
701         /* We don't yet know rel_tuples, so pass -1 */
702         /* index_bulk_delete can't have skipped scan anyway ... */
703         vcinfo.num_heap_tuples = -1;
704
705         stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
706
707         /*
708          * Release lock acquired above.
709          */
710         if (indrel->rd_am->amconcurrent)
711                 UnlockRelation(indrel, RowExclusiveLock);
712         else
713                 UnlockRelation(indrel, AccessExclusiveLock);
714
715         if (!stats)
716                 return;
717
718         /* accumulate total removed over multiple index-cleaning cycles */
719         *index_tups_vacuumed += stats->tuples_removed;
720         *index_pages_removed += stats->pages_removed;
721
722         /* now update statistics in pg_class */
723         vac_update_relstats(RelationGetRelid(indrel),
724                                                 stats->num_pages,
725                                                 stats->num_index_tuples,
726                                                 false);
727
728         ereport(elevel,
729                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
730                                         RelationGetRelationName(indrel),
731                                         stats->num_index_tuples,
732                                         stats->num_pages),
733                          errdetail("%.0f index row versions were removed.\n"
734                          "%u index pages have been deleted, %u are currently reusable.\n"
735                                            "%s.",
736                                            stats->tuples_removed,
737                                            stats->pages_deleted, stats->pages_free,
738                                            pg_rusage_show(&ru0))));
739
740         pfree(stats);
741 }
742
743 /*
744  * lazy_truncate_heap - try to truncate off any empty pages at the end
745  */
746 static void
747 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
748 {
749         BlockNumber old_rel_pages = vacrelstats->rel_pages;
750         BlockNumber new_rel_pages;
751         PageFreeSpaceInfo *pageSpaces;
752         int                     n;
753         int                     i,
754                                 j;
755         PGRUsage        ru0;
756
757         pg_rusage_init(&ru0);
758
759         /*
760          * We need full exclusive lock on the relation in order to do truncation.
761          * If we can't get it, give up rather than waiting --- we don't want to
762          * block other backends, and we don't want to deadlock (which is quite
763          * possible considering we already hold a lower-grade lock).
764          */
765         if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
766                 return;
767
768         /*
769          * Now that we have exclusive lock, look to see if the rel has grown
770          * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
771          * newly added pages presumably contain non-deletable tuples.
772          */
773         new_rel_pages = RelationGetNumberOfBlocks(onerel);
774         if (new_rel_pages != old_rel_pages)
775         {
776                 /* might as well use the latest news when we update pg_class stats */
777                 vacrelstats->rel_pages = new_rel_pages;
778                 UnlockRelation(onerel, AccessExclusiveLock);
779                 return;
780         }
781
782         /*
783          * Scan backwards from the end to verify that the end pages actually
784          * contain nothing we need to keep.  This is *necessary*, not optional,
785          * because other backends could have added tuples to these pages whilst we
786          * were vacuuming.
787          */
788         new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
789
790         if (new_rel_pages >= old_rel_pages)
791         {
792                 /* can't do anything after all */
793                 UnlockRelation(onerel, AccessExclusiveLock);
794                 return;
795         }
796
797         /*
798          * Okay to truncate.
799          */
800         RelationTruncate(onerel, new_rel_pages);
801
802         /*
803          * Drop free-space info for removed blocks; these must not get entered
804          * into the FSM!
805          */
806         pageSpaces = vacrelstats->free_pages;
807         n = vacrelstats->num_free_pages;
808         j = 0;
809         for (i = 0; i < n; i++)
810         {
811                 if (pageSpaces[i].blkno < new_rel_pages)
812                 {
813                         pageSpaces[j] = pageSpaces[i];
814                         j++;
815                 }
816         }
817         vacrelstats->num_free_pages = j;
818         /* We destroyed the heap ordering, so mark array unordered */
819         vacrelstats->fs_is_heap = false;
820
821         /* update statistics */
822         vacrelstats->rel_pages = new_rel_pages;
823         vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
824
825         /*
826          * We keep the exclusive lock until commit (perhaps not necessary?).
827          */
828
829         ereport(elevel,
830                         (errmsg("\"%s\": truncated %u to %u pages",
831                                         RelationGetRelationName(onerel),
832                                         old_rel_pages, new_rel_pages),
833                          errdetail("%s.",
834                                            pg_rusage_show(&ru0))));
835 }
836
837 /*
838  * Rescan end pages to verify that they are (still) empty of needed tuples.
839  *
840  * Returns number of nondeletable pages (last nonempty page + 1).
841  */
842 static BlockNumber
843 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
844 {
845         BlockNumber blkno;
846         HeapTupleData tuple;
847
848         /* Strange coding of loop control is needed because blkno is unsigned */
849         blkno = vacrelstats->rel_pages;
850         while (blkno > vacrelstats->nonempty_pages)
851         {
852                 Buffer          buf;
853                 Page            page;
854                 OffsetNumber offnum,
855                                         maxoff;
856                 bool            tupgone,
857                                         hastup;
858
859                 vacuum_delay_point();
860
861                 blkno--;
862
863                 buf = ReadBuffer(onerel, blkno);
864
865                 /* In this phase we only need shared access to the buffer */
866                 LockBuffer(buf, BUFFER_LOCK_SHARE);
867
868                 page = BufferGetPage(buf);
869
870                 if (PageIsNew(page) || PageIsEmpty(page))
871                 {
872                         /* PageIsNew probably shouldn't happen... */
873                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
874                         ReleaseBuffer(buf);
875                         continue;
876                 }
877
878                 hastup = false;
879                 maxoff = PageGetMaxOffsetNumber(page);
880                 for (offnum = FirstOffsetNumber;
881                          offnum <= maxoff;
882                          offnum = OffsetNumberNext(offnum))
883                 {
884                         ItemId          itemid;
885
886                         itemid = PageGetItemId(page, offnum);
887
888                         if (!ItemIdIsUsed(itemid))
889                                 continue;
890
891                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
892                         tuple.t_len = ItemIdGetLength(itemid);
893                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
894
895                         tupgone = false;
896
897                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
898                         {
899                                 case HEAPTUPLE_DEAD:
900                                         tupgone = true;         /* we can delete the tuple */
901                                         break;
902                                 case HEAPTUPLE_LIVE:
903                                         /* Shouldn't be necessary to re-freeze anything */
904                                         break;
905                                 case HEAPTUPLE_RECENTLY_DEAD:
906
907                                         /*
908                                          * If tuple is recently deleted then we must not remove it
909                                          * from relation.
910                                          */
911                                         break;
912                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
913                                         /* This is an expected case during concurrent vacuum */
914                                         break;
915                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
916                                         /* This is an expected case during concurrent vacuum */
917                                         break;
918                                 default:
919                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
920                                         break;
921                         }
922
923                         if (!tupgone)
924                         {
925                                 hastup = true;
926                                 break;                  /* can stop scanning */
927                         }
928                 }                                               /* scan along page */
929
930                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
931
932                 ReleaseBuffer(buf);
933
934                 /* Done scanning if we found a tuple here */
935                 if (hastup)
936                         return blkno + 1;
937         }
938
939         /*
940          * If we fall out of the loop, all the previously-thought-to-be-empty
941          * pages really are; we need not bother to look at the last known-nonempty
942          * page.
943          */
944         return vacrelstats->nonempty_pages;
945 }
946
947 /*
948  * lazy_space_alloc - space allocation decisions for lazy vacuum
949  *
950  * See the comments at the head of this file for rationale.
951  */
952 static void
953 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
954 {
955         long            maxtuples;
956         int                     maxpages;
957
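        /* maintenance_work_mem is expressed in kilobytes, hence the 1024 factor */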
958         maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
959         maxtuples = Min(maxtuples, INT_MAX);
960         /* stay sane if small maintenance_work_mem */
961         maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
962
963         vacrelstats->num_dead_tuples = 0;
964         vacrelstats->max_dead_tuples = (int) maxtuples;
965         vacrelstats->dead_tuples = (ItemPointer)
966                 palloc(maxtuples * sizeof(ItemPointerData));
967
968         maxpages = MaxFSMPages;
969         /* No need to allocate more pages than the relation has blocks */
970         if (relblocks < (BlockNumber) maxpages)
971                 maxpages = (int) relblocks;
972
973         vacrelstats->fs_is_heap = false;
974         vacrelstats->num_free_pages = 0;
975         vacrelstats->max_free_pages = maxpages;
976         vacrelstats->free_pages = (PageFreeSpaceInfo *)
977                 palloc(maxpages * sizeof(PageFreeSpaceInfo));
978 }
979
980 /*
981  * lazy_record_dead_tuple - remember one deletable tuple
982  */
983 static void
984 lazy_record_dead_tuple(LVRelStats *vacrelstats,
985                                            ItemPointer itemptr)
986 {
987         /*
988          * The array shouldn't overflow under normal behavior, but perhaps it
989          * could if we are given a really small maintenance_work_mem. In that
990          * case, just forget the last few tuples.
991          */
992         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
993         {
994                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
995                 vacrelstats->num_dead_tuples++;
996         }
997 }
998
999 /*
1000  * lazy_record_free_space - remember free space on one page
1001  */
1002 static void
1003 lazy_record_free_space(LVRelStats *vacrelstats,
1004                                            BlockNumber page,
1005                                            Size avail)
1006 {
1007         PageFreeSpaceInfo *pageSpaces;
1008         int                     n;
1009
1010         /*
1011          * A page with less than stats->threshold free space will be forgotten
1012          * immediately, and never passed to the free space map.  Removing the
1013          * uselessly small entries early saves cycles, and in particular reduces
1014          * the amount of time we spend holding the FSM lock when we finally call
1015          * RecordRelationFreeSpace.  Since the FSM will probably drop pages with
1016          * little free space anyway, there's no point in making this really small.
1017          *
1018          * XXX Is it worth trying to measure average tuple size, and using that to
1019          * adjust the threshold?  Would be worthwhile if FSM has no stats yet for
1020          * this relation.  But changing the threshold as we scan the rel might
1021          * lead to bizarre behavior, too.  Also, it's probably better if vacuum.c
1022          * has the same thresholding behavior as we do here.
1023          */
1024         if (avail < vacrelstats->threshold)
1025                 return;
1026
1027         /* Copy pointers to local variables for notational simplicity */
1028         pageSpaces = vacrelstats->free_pages;
1029         n = vacrelstats->max_free_pages;
1030
1031         /* If we haven't filled the array yet, just keep adding entries */
1032         if (vacrelstats->num_free_pages < n)
1033         {
1034                 pageSpaces[vacrelstats->num_free_pages].blkno = page;
1035                 pageSpaces[vacrelstats->num_free_pages].avail = avail;
1036                 vacrelstats->num_free_pages++;
1037                 return;
1038         }
1039
1040         /*----------
1041          * The rest of this routine works with "heap" organization of the
1042          * free space arrays, wherein we maintain the heap property
1043          *                      avail[(j-1) div 2] <= avail[j]  for 0 < j < n.
1044          * In particular, the zero'th element always has the smallest available
1045          * space and can be discarded to make room for a new page with more space.
1046          * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
1047          * but note he uses 1-origin array subscripts, not 0-origin.
1048          *----------
1049          */
1050
1051         /* If we haven't yet converted the array to heap organization, do it */
1052         if (!vacrelstats->fs_is_heap)
1053         {
1054                 /*
1055                  * Scan backwards through the array, "sift-up" each value into its
1056                  * correct position.  We can start the scan at n/2-1 since each entry
1057                  * above that position has no children to worry about.
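                 * (This is the standard bottom-up heap construction, which takes
                 * only O(n) comparisons overall.)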
1058                  */
1059                 int                     l = n / 2;
1060
1061                 while (--l >= 0)
1062                 {
1063                         BlockNumber R = pageSpaces[l].blkno;
1064                         Size            K = pageSpaces[l].avail;
1065                         int                     i;              /* i is where the "hole" is */
1066
1067                         i = l;
1068                         for (;;)
1069                         {
1070                                 int                     j = 2 * i + 1;
1071
1072                                 if (j >= n)
1073                                         break;
1074                                 if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
1075                                         j++;
1076                                 if (K <= pageSpaces[j].avail)
1077                                         break;
1078                                 pageSpaces[i] = pageSpaces[j];
1079                                 i = j;
1080                         }
1081                         pageSpaces[i].blkno = R;
1082                         pageSpaces[i].avail = K;
1083                 }
1084
1085                 vacrelstats->fs_is_heap = true;
1086         }
1087
1088         /* If new page has more than zero'th entry, insert it into heap */
1089         if (avail > pageSpaces[0].avail)
1090         {
1091                 /*
1092                  * Notionally, we replace the zero'th entry with the new data, and
1093                  * then sift-up to maintain the heap property.  Physically, the new
1094                  * data doesn't get stored into the arrays until we find the right
1095                  * location for it.
1096                  */
1097                 int                     i = 0;          /* i is where the "hole" is */
1098
1099                 for (;;)
1100                 {
1101                         int                     j = 2 * i + 1;
1102
1103                         if (j >= n)
1104                                 break;
1105                         if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
1106                                 j++;
1107                         if (avail <= pageSpaces[j].avail)
1108                                 break;
1109                         pageSpaces[i] = pageSpaces[j];
1110                         i = j;
1111                 }
1112                 pageSpaces[i].blkno = page;
1113                 pageSpaces[i].avail = avail;
1114         }
1115 }
1116
1117 /*
1118  *      lazy_tid_reaped() -- is a particular tid deletable?
1119  *
1120  *              This has the right signature to be an IndexBulkDeleteCallback.
1121  *
1122  *              Assumes dead_tuples array is in sorted order.
1123  */
1124 static bool
1125 lazy_tid_reaped(ItemPointer itemptr, void *state)
1126 {
1127         LVRelStats *vacrelstats = (LVRelStats *) state;
1128         ItemPointer res;
1129
1130         res = (ItemPointer) bsearch((void *) itemptr,
1131                                                                 (void *) vacrelstats->dead_tuples,
1132                                                                 vacrelstats->num_dead_tuples,
1133                                                                 sizeof(ItemPointerData),
1134                                                                 vac_cmp_itemptr);
1135
1136         return (res != NULL);
1137 }
1138
1139 /*
1140  * Dummy version for lazy_scan_index.
1141  */
1142 static bool
1143 dummy_tid_reaped(ItemPointer itemptr, void *state)
1144 {
1145         return false;
1146 }
1147
1148 /*
1149  * Update the shared Free Space Map with the info we now have about
1150  * free space in the relation, discarding any old info the map may have.
1151  */
1152 static void
1153 lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
1154 {
1155         PageFreeSpaceInfo *pageSpaces = vacrelstats->free_pages;
1156         int                     nPages = vacrelstats->num_free_pages;
1157
1158         /*
1159          * Sort data into order, as required by RecordRelationFreeSpace.
1160          */
1161         if (nPages > 1)
1162                 qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
1163                           vac_cmp_page_spaces);
1164
1165         RecordRelationFreeSpace(&onerel->rd_node, nPages, pageSpaces);
1166 }
1167
1168 /*
1169  * Comparator routines for use with qsort() and bsearch().
1170  */
1171 static int
1172 vac_cmp_itemptr(const void *left, const void *right)
1173 {
1174         BlockNumber lblk,
1175                                 rblk;
1176         OffsetNumber loff,
1177                                 roff;
1178
1179         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1180         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1181
1182         if (lblk < rblk)
1183                 return -1;
1184         if (lblk > rblk)
1185                 return 1;
1186
1187         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1188         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1189
1190         if (loff < roff)
1191                 return -1;
1192         if (loff > roff)
1193                 return 1;
1194
1195         return 0;
1196 }
1197
1198 static int
1199 vac_cmp_page_spaces(const void *left, const void *right)
1200 {
1201         PageFreeSpaceInfo *linfo = (PageFreeSpaceInfo *) left;
1202         PageFreeSpaceInfo *rinfo = (PageFreeSpaceInfo *) right;
1203
1204         if (linfo->blkno < rinfo->blkno)
1205                 return -1;
1206         else if (linfo->blkno > rinfo->blkno)
1207                 return 1;
1208         return 0;
1209 }