1 /*-------------------------------------------------------------------------
2  *
3  * vacuumlazy.c
4  *        Concurrent ("lazy") vacuuming.
5  *
6  *
7  * The major space usage for LAZY VACUUM is storage for the array of dead
8  * tuple TIDs, with the next biggest need being storage for per-disk-page
9  * free space info.  We want to ensure we can vacuum even the very largest
10  * relations with finite memory space usage.  To do that, we set upper bounds
11  * on the number of tuples and pages we will keep track of at once.
12  *
13  * We are willing to use at most maintenance_work_mem memory space to keep
14  * track of dead tuples.  We initially allocate an array of TIDs of that size,
15  * with an upper limit that depends on table size (this limit ensures we don't
16  * allocate a huge area uselessly for vacuuming small tables).  If the array
17  * threatens to overflow, we suspend the heap scan phase and perform a pass of
18  * index cleanup and page compaction, then resume the heap scan with an empty
19  * TID array.
20  *
21  * If we're processing a table with no indexes, we can just vacuum each page
22  * as we go; there's no need to save up multiple tuples to minimize the number
23  * of index scans performed.  So we don't use maintenance_work_mem memory for
24  * the TID array, just enough to hold as many heap tuples as fit on one page.
25  *
26  *
27  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
28  * Portions Copyright (c) 1994, Regents of the University of California
29  *
30  *
31  * IDENTIFICATION
32  *        src/backend/commands/vacuumlazy.c
33  *
34  *-------------------------------------------------------------------------
35  */
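/*
 * Illustrative sketch only (the real sizing logic lives in lazy_space_alloc
 * below): with indexes present, the dead-tuple TID array is sized from
 * maintenance_work_mem but clamped by table size, roughly
 *
 *     maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
 *     maxtuples = Min(maxtuples, relblocks * LAZY_ALLOC_TUPLES);
 *     maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
 *
 * so 64MB of maintenance_work_mem admits roughly 11 million 6-byte TIDs
 * before a round of index vacuuming is forced.  Without indexes, only one
 * page's worth (MaxHeapTuplesPerPage) is allocated.
 */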
36 #include "postgres.h"
37
38 #include <math.h>
39
40 #include "access/genam.h"
41 #include "access/heapam.h"
42 #include "access/heapam_xlog.h"
43 #include "access/htup_details.h"
44 #include "access/multixact.h"
45 #include "access/transam.h"
46 #include "access/visibilitymap.h"
47 #include "catalog/storage.h"
48 #include "commands/dbcommands.h"
49 #include "commands/vacuum.h"
50 #include "miscadmin.h"
51 #include "pgstat.h"
52 #include "portability/instr_time.h"
53 #include "postmaster/autovacuum.h"
54 #include "storage/bufmgr.h"
55 #include "storage/freespace.h"
56 #include "storage/lmgr.h"
57 #include "utils/lsyscache.h"
58 #include "utils/memutils.h"
59 #include "utils/pg_rusage.h"
60 #include "utils/timestamp.h"
61 #include "utils/tqual.h"
62
63
64 /*
65  * Space/time tradeoff parameters: do these need to be user-tunable?
66  *
67  * To consider truncating the relation, we want there to be at least
68  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
69  * is less) potentially-freeable pages.
70  */
71 #define REL_TRUNCATE_MINIMUM    1000
72 #define REL_TRUNCATE_FRACTION   16
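/*
 * Worked example: the test in lazy_vacuum_rel() is
 *
 *     possibly_freeable >= REL_TRUNCATE_MINIMUM ||
 *     possibly_freeable >= rel_pages / REL_TRUNCATE_FRACTION
 *
 * so an 8000-page table is considered for truncation once 500 trailing
 * pages (8000/16) look freeable, while very large tables never need more
 * than 1000 freeable trailing pages to qualify.
 */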
73
74 /*
75  * Timing parameters for truncate locking heuristics.
76  *
77  * These were not exposed as user tunable GUC values because it didn't seem
78  * that the potential for improvement was great enough to merit the cost of
79  * supporting them.
80  */
81 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL             20      /* ms */
82 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL              50      /* ms */
83 #define VACUUM_TRUNCATE_LOCK_TIMEOUT                    5000            /* ms */
84
85 /*
86  * Guesstimation of number of dead tuples per page.  This is used to
87  * provide an upper limit to memory allocated when vacuuming small
88  * tables.
89  */
90 #define LAZY_ALLOC_TUPLES               MaxHeapTuplesPerPage
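/*
 * With the default 8kB block size MaxHeapTuplesPerPage is 291, so this
 * budgets about 291 * sizeof(ItemPointerData) = ~1.7kB of TID storage per
 * heap page when sizing the array for small tables.
 */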
91
92 /*
93  * Before we consider skipping a page that's marked as clean in
94  * visibility map, we must've seen at least this many clean pages.
95  */
96 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
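/*
 * With the default 8kB block size this amounts to 256kB of consecutive
 * all-visible heap before skipping kicks in.
 */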
97
98 typedef struct LVRelStats
99 {
100         /* hasindex = true means two-pass strategy; false means one-pass */
101         bool            hasindex;
102         /* Overall statistics about rel */
103         BlockNumber old_rel_pages;      /* previous value of pg_class.relpages */
104         BlockNumber rel_pages;          /* total number of pages */
105         BlockNumber scanned_pages;      /* number of pages we examined */
106         double          scanned_tuples; /* counts only tuples on scanned pages */
107         double          old_rel_tuples; /* previous value of pg_class.reltuples */
108         double          new_rel_tuples; /* new estimated total # of tuples */
109         BlockNumber pages_removed;
110         double          tuples_deleted;
111         BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
112         /* List of TIDs of tuples we intend to delete */
113         /* NB: this list is ordered by TID address */
114         int                     num_dead_tuples;        /* current # of entries */
115         int                     max_dead_tuples;        /* # slots allocated in array */
116         ItemPointer dead_tuples;        /* array of ItemPointerData */
117         int                     num_index_scans;
118         TransactionId latestRemovedXid;
119         bool            lock_waiter_detected;
120 } LVRelStats;
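/*
 * Because lazy_scan_heap processes blocks in ascending block-number order
 * and item offsets in ascending order within each block, dead_tuples is
 * naturally sorted by TID, which is what lets lazy_tid_reaped locate
 * entries with a binary search (see vac_cmp_itemptr).
 */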
121
122
123 /* A few variables that don't seem worth passing around as parameters */
124 static int      elevel = -1;
125
126 static TransactionId OldestXmin;
127 static TransactionId FreezeLimit;
128 static MultiXactId MultiXactFrzLimit;
129
130 static BufferAccessStrategy vac_strategy;
131
132
133 /* non-export function prototypes */
134 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
135                            Relation *Irel, int nindexes, bool scan_all);
136 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
137 static bool lazy_check_needs_freeze(Buffer buf);
138 static void lazy_vacuum_index(Relation indrel,
139                                   IndexBulkDeleteResult **stats,
140                                   LVRelStats *vacrelstats);
141 static void lazy_cleanup_index(Relation indrel,
142                                    IndexBulkDeleteResult *stats,
143                                    LVRelStats *vacrelstats);
144 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
145                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
146 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
147 static BlockNumber count_nondeletable_pages(Relation onerel,
148                                                  LVRelStats *vacrelstats);
149 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
150 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
151                                            ItemPointer itemptr);
152 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
153 static int      vac_cmp_itemptr(const void *left, const void *right);
154 static bool heap_page_is_all_visible(Buffer buf,
155                                                  TransactionId *visibility_cutoff_xid);
156
157
158 /*
159  *      lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
160  *
161  *              This routine vacuums a single heap, cleans out its indexes, and
162  *              updates its relpages and reltuples statistics.
163  *
164  *              At entry, we have already established a transaction and opened
165  *              and locked the relation.
166  */
167 void
168 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
169                                 BufferAccessStrategy bstrategy)
170 {
171         LVRelStats *vacrelstats;
172         Relation   *Irel;
173         int                     nindexes;
174         BlockNumber possibly_freeable;
175         PGRUsage        ru0;
176         TimestampTz starttime = 0;
177         long            secs;
178         int                     usecs;
179         double          read_rate,
180                                 write_rate;
181         bool            scan_all;
182         TransactionId freezeTableLimit;
183         BlockNumber new_rel_pages;
184         double          new_rel_tuples;
185         BlockNumber new_rel_allvisible;
186         TransactionId new_frozen_xid;
187         MultiXactId     new_min_multi;
188
189         /* measure elapsed time iff autovacuum logging requires it */
190         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
191         {
192                 pg_rusage_init(&ru0);
193                 starttime = GetCurrentTimestamp();
194         }
195
196         if (vacstmt->options & VACOPT_VERBOSE)
197                 elevel = INFO;
198         else
199                 elevel = DEBUG2;
200
201         vac_strategy = bstrategy;
202
203         vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
204                                                   onerel->rd_rel->relisshared,
205                                                   &OldestXmin, &FreezeLimit, &freezeTableLimit,
206                                                   &MultiXactFrzLimit);
207         scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
208                                                                                          freezeTableLimit);
209
210         vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
211
212         vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
213         vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
214         vacrelstats->num_index_scans = 0;
215         vacrelstats->pages_removed = 0;
216         vacrelstats->lock_waiter_detected = false;
217
218         /* Open all indexes of the relation */
219         vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
220         vacrelstats->hasindex = (nindexes > 0);
221
222         /* Do the vacuuming */
223         lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
224
225         /* Done with indexes */
226         vac_close_indexes(nindexes, Irel, NoLock);
227
228         /*
229          * Optionally truncate the relation.
230          *
231          * Don't even think about it unless we have a shot at releasing a goodly
232          * number of pages.  Otherwise, the time taken isn't worth it.
233          */
234         possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
235         if (possibly_freeable > 0 &&
236                 (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
237                  possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
238                 lazy_truncate_heap(onerel, vacrelstats);
239
240         /* Vacuum the Free Space Map */
241         FreeSpaceMapVacuum(onerel);
242
243         /*
244          * Update statistics in pg_class.
245          *
246          * A corner case here is that if we scanned no pages at all because every
247          * page is all-visible, we should not update relpages/reltuples, because
248          * we have no new information to contribute.  In particular this keeps us
249          * from replacing relpages=reltuples=0 (which means "unknown tuple
250          * density") with nonzero relpages and reltuples=0 (which means "zero
251          * tuple density") unless there's some actual evidence for the latter.
252          *
253          * We do update relallvisible even in the corner case, since if the table
254          * is all-visible we'd definitely like to know that.  But clamp the value
255          * to be not more than what we're setting relpages to.
256          *
257          * Also, don't change relfrozenxid if we skipped any pages, since then we
258          * don't know for certain that all tuples have a newer xmin.
259          */
260         new_rel_pages = vacrelstats->rel_pages;
261         new_rel_tuples = vacrelstats->new_rel_tuples;
262         if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
263         {
264                 new_rel_pages = vacrelstats->old_rel_pages;
265                 new_rel_tuples = vacrelstats->old_rel_tuples;
266         }
267
268         new_rel_allvisible = visibilitymap_count(onerel);
269         if (new_rel_allvisible > new_rel_pages)
270                 new_rel_allvisible = new_rel_pages;
271
272         new_frozen_xid = FreezeLimit;
273         if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
274                 new_frozen_xid = InvalidTransactionId;
275
276         new_min_multi = MultiXactFrzLimit;
277         if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
278                 new_min_multi = InvalidMultiXactId;
279
280         vac_update_relstats(onerel,
281                                                 new_rel_pages,
282                                                 new_rel_tuples,
283                                                 new_rel_allvisible,
284                                                 vacrelstats->hasindex,
285                                                 new_frozen_xid,
286                                                 new_min_multi);
287
288         /* report results to the stats collector, too */
289         pgstat_report_vacuum(RelationGetRelid(onerel),
290                                                   onerel->rd_rel->relisshared,
291                                                   new_rel_tuples);
292
293         /* and log the action if appropriate */
294         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
295         {
296                 TimestampTz endtime = GetCurrentTimestamp();
297
298                 if (Log_autovacuum_min_duration == 0 ||
299                         TimestampDifferenceExceeds(starttime, endtime,
300                                                                            Log_autovacuum_min_duration))
301                 {
302                         TimestampDifference(starttime, endtime, &secs, &usecs);
303
304                         read_rate = 0;
305                         write_rate = 0;
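                        /*
                         * Each page miss or dirtied page represents one BLCKSZ-byte
                         * block read or written, so the rates below are
                         * (pages * BLCKSZ) bytes, converted to MB and divided by the
                         * elapsed time.
                         */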
306                         if ((secs > 0) || (usecs > 0))
307                         {
308                                 read_rate = (double) BLCKSZ *VacuumPageMiss / (1024 * 1024) /
309                                                         (secs + usecs / 1000000.0);
310                                 write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) /
311                                                         (secs + usecs / 1000000.0);
312                         }
313                         ereport(LOG,
314                                         (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
315                                                         "pages: %d removed, %d remain\n"
316                                                         "tuples: %.0f removed, %.0f remain\n"
317                                                         "buffer usage: %d hits, %d misses, %d dirtied\n"
318                                         "avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"
319                                                         "system usage: %s",
320                                                         get_database_name(MyDatabaseId),
321                                                         get_namespace_name(RelationGetNamespace(onerel)),
322                                                         RelationGetRelationName(onerel),
323                                                         vacrelstats->num_index_scans,
324                                                         vacrelstats->pages_removed,
325                                                         vacrelstats->rel_pages,
326                                                         vacrelstats->tuples_deleted,
327                                                         vacrelstats->new_rel_tuples,
328                                                         VacuumPageHit,
329                                                         VacuumPageMiss,
330                                                         VacuumPageDirty,
331                                                         read_rate, write_rate,
332                                                         pg_rusage_show(&ru0))));
333                 }
334         }
335 }
336
337 /*
338  * For Hot Standby we need to know the highest transaction id that will
339  * be removed by any change. VACUUM proceeds in a number of passes so
340  * we need to consider how each pass operates. The first phase runs
341  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
342  * progresses - these will have a latestRemovedXid on each record.
343  * In some cases this removes all of the tuples to be removed, though
344  * often we have dead tuples with index pointers so we must remember them
345  * for removal in phase 3. Index records for those rows are removed
346  * in phase 2 and index blocks do not have MVCC information attached.
347  * So before we can allow removal of any index tuples we need to issue
348  * a WAL record containing the latestRemovedXid of rows that will be
349  * removed in phase three. This allows recovery queries to block at the
350  * correct place, i.e. before phase two, rather than during phase three
351  * which would be after the rows have become inaccessible.
352  */
353 static void
354 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
355 {
356         /*
357          * Skip this for relations for which no WAL is to be written, or if we're
358          * not trying to support archive recovery.
359          */
360         if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
361                 return;
362
363         /*
364          * No need to write the record at all unless it contains a valid value
365          */
366         if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
367                 (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
368 }
369
370 /*
371  *      lazy_scan_heap() -- scan an open heap relation
372  *
373  *              This routine prunes each page in the heap, which will among other
374  *              things truncate dead tuples to dead line pointers, defragment the
375  *              page, and set commit status bits (see heap_page_prune).  It also builds
376  *              lists of dead tuples and pages with free space, calculates statistics
377  *              on the number of live tuples in the heap, and marks pages as
378  *              all-visible if appropriate.  When done, or when we run low on space for
379  *              dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
380  *              to reclaim dead line pointers.
381  *
382  *              If there are no indexes then we can reclaim line pointers on the fly;
383  *              dead line pointers need only be retained until all index pointers that
384  *              reference them have been killed.
385  */
386 static void
387 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
388                            Relation *Irel, int nindexes, bool scan_all)
389 {
390         BlockNumber nblocks,
391                                 blkno;
392         HeapTupleData tuple;
393         char       *relname;
394         BlockNumber empty_pages,
395                                 vacuumed_pages;
396         double          num_tuples,
397                                 tups_vacuumed,
398                                 nkeep,
399                                 nunused;
400         IndexBulkDeleteResult **indstats;
401         int                     i;
402         PGRUsage        ru0;
403         Buffer          vmbuffer = InvalidBuffer;
404         BlockNumber next_not_all_visible_block;
405         bool            skipping_all_visible_blocks;
406
407         pg_rusage_init(&ru0);
408
409         relname = RelationGetRelationName(onerel);
410         ereport(elevel,
411                         (errmsg("vacuuming \"%s.%s\"",
412                                         get_namespace_name(RelationGetNamespace(onerel)),
413                                         relname)));
414
415         empty_pages = vacuumed_pages = 0;
416         num_tuples = tups_vacuumed = nkeep = nunused = 0;
417
418         indstats = (IndexBulkDeleteResult **)
419                 palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
420
421         nblocks = RelationGetNumberOfBlocks(onerel);
422         vacrelstats->rel_pages = nblocks;
423         vacrelstats->scanned_pages = 0;
424         vacrelstats->nonempty_pages = 0;
425         vacrelstats->latestRemovedXid = InvalidTransactionId;
426
427         lazy_space_alloc(vacrelstats, nblocks);
428
429         /*
430          * We want to skip pages that don't require vacuuming according to the
431          * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
432          * consecutive pages.  Since we're reading sequentially, the OS should be
433          * doing readahead for us, so there's no gain in skipping a page now and
434          * then; that's likely to disable readahead and so be counterproductive.
435          * Also, skipping even a single page means that we can't update
436          * relfrozenxid, so we only want to do it if we can skip a goodly number
437          * of pages.
438          *
439          * Before entering the main loop, establish the invariant that
440          * next_not_all_visible_block is the next block number >= blkno that's not
441          * all-visible according to the visibility map, or nblocks if there's no
442          * such block.  Also, we set up the skipping_all_visible_blocks flag,
443          * which is needed because we need hysteresis in the decision: once we've
444          * started skipping blocks, we may as well skip everything up to the next
445          * not-all-visible block.
446          *
447          * Note: if scan_all is true, we won't actually skip any pages; but we
448          * maintain next_not_all_visible_block anyway, so as to set up the
449          * all_visible_according_to_vm flag correctly for each page.
450          *
451          * Note: The value returned by visibilitymap_test could be slightly
452          * out-of-date, since we make this test before reading the corresponding
453          * heap page or locking the buffer.  This is OK.  If we mistakenly think
454          * that the page is all-visible when in fact the flag's just been cleared,
455          * we might fail to vacuum the page.  But it's OK to skip pages when
456          * scan_all is not set, so no great harm done; the next vacuum will find
457          * them.  If we make the reverse mistake and vacuum a page unnecessarily,
458          * it'll just be a no-op.
459          */
460         for (next_not_all_visible_block = 0;
461                  next_not_all_visible_block < nblocks;
462                  next_not_all_visible_block++)
463         {
464                 if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
465                         break;
466                 vacuum_delay_point();
467         }
468         if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
469                 skipping_all_visible_blocks = true;
470         else
471                 skipping_all_visible_blocks = false;
472
473         for (blkno = 0; blkno < nblocks; blkno++)
474         {
475                 Buffer          buf;
476                 Page            page;
477                 OffsetNumber offnum,
478                                         maxoff;
479                 bool            tupgone,
480                                         hastup;
481                 int                     prev_dead_count;
482                 OffsetNumber frozen[MaxOffsetNumber];
483                 int                     nfrozen;
484                 Size            freespace;
485                 bool            all_visible_according_to_vm;
486                 bool            all_visible;
487                 bool            has_dead_tuples;
488                 TransactionId visibility_cutoff_xid = InvalidTransactionId;
489
490                 if (blkno == next_not_all_visible_block)
491                 {
492                         /* Time to advance next_not_all_visible_block */
493                         for (next_not_all_visible_block++;
494                                  next_not_all_visible_block < nblocks;
495                                  next_not_all_visible_block++)
496                         {
497                                 if (!visibilitymap_test(onerel, next_not_all_visible_block,
498                                                                                 &vmbuffer))
499                                         break;
500                                 vacuum_delay_point();
501                         }
502
503                         /*
504                          * We know we can't skip the current block.  But set up
505                          * skipping_all_visible_blocks to do the right thing at the
506                          * following blocks.
507                          */
508                         if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
509                                 skipping_all_visible_blocks = true;
510                         else
511                                 skipping_all_visible_blocks = false;
512                         all_visible_according_to_vm = false;
513                 }
514                 else
515                 {
516                         /* Current block is all-visible */
517                         if (skipping_all_visible_blocks && !scan_all)
518                                 continue;
519                         all_visible_according_to_vm = true;
520                 }
521
522                 vacuum_delay_point();
523
524                 /*
525                  * If we are close to overrunning the available space for dead-tuple
526                  * TIDs, pause and do a cycle of vacuuming before we tackle this page.
527                  */
528                 if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
529                         vacrelstats->num_dead_tuples > 0)
530                 {
531                         /*
532                          * Before beginning index vacuuming, we release any pin we may
533                          * hold on the visibility map page.  This isn't necessary for
534                          * correctness, but we do it anyway to avoid holding the pin
535                          * across a lengthy, unrelated operation.
536                          */
537                         if (BufferIsValid(vmbuffer))
538                         {
539                                 ReleaseBuffer(vmbuffer);
540                                 vmbuffer = InvalidBuffer;
541                         }
542
543                         /* Log cleanup info before we touch indexes */
544                         vacuum_log_cleanup_info(onerel, vacrelstats);
545
546                         /* Remove index entries */
547                         for (i = 0; i < nindexes; i++)
548                                 lazy_vacuum_index(Irel[i],
549                                                                   &indstats[i],
550                                                                   vacrelstats);
551                         /* Remove tuples from heap */
552                         lazy_vacuum_heap(onerel, vacrelstats);
553
554                         /*
555                          * Forget the now-vacuumed tuples, and press on, but be careful
556                          * not to reset latestRemovedXid since we want that value to be
557                          * valid.
558                          */
559                         vacrelstats->num_dead_tuples = 0;
560                         vacrelstats->num_index_scans++;
561                 }
562
563                 /*
564                  * Pin the visibility map page in case we need to mark the page
565                  * all-visible.  In most cases this will be very cheap, because we'll
566                  * already have the correct page pinned anyway.  However, it's
567                  * possible that (a) next_not_all_visible_block is covered by a
568                  * different VM page than the current block or (b) we released our pin
569                  * and did a cycle of index vacuuming.
570                  */
571                 visibilitymap_pin(onerel, blkno, &vmbuffer);
572
573                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
574                                                                  RBM_NORMAL, vac_strategy);
575
576                 /* We need buffer cleanup lock so that we can prune HOT chains. */
577                 if (!ConditionalLockBufferForCleanup(buf))
578                 {
579                         /*
580                          * If we're not scanning the whole relation to guard against XID
581                          * wraparound, it's OK to skip vacuuming a page.  The next vacuum
582                          * will clean it up.
583                          */
584                         if (!scan_all)
585                         {
586                                 ReleaseBuffer(buf);
587                                 continue;
588                         }
589
590                         /*
591                          * If this is a wraparound checking vacuum, then we read the page
592                          * with share lock to see if any xids need to be frozen. If the
593                          * page doesn't need attention we just skip and continue. If it
594                          * does, we wait for cleanup lock.
595                          *
596                          * We could defer the lock request further by remembering the page
597                          * and coming back to it later, or we could even register
598                          * ourselves for multiple buffers and then service whichever one
599                          * is received first.  For now, this seems good enough.
600                          */
601                         LockBuffer(buf, BUFFER_LOCK_SHARE);
602                         if (!lazy_check_needs_freeze(buf))
603                         {
604                                 UnlockReleaseBuffer(buf);
605                                 continue;
606                         }
607                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
608                         LockBufferForCleanup(buf);
609                         /* drop through to normal processing */
610                 }
611
612                 vacrelstats->scanned_pages++;
613
614                 page = BufferGetPage(buf);
615
616                 if (PageIsNew(page))
617                 {
618                         /*
619                          * An all-zeroes page could be left over if a backend extends the
620                          * relation but crashes before initializing the page. Reclaim such
621                          * pages for use.
622                          *
623                          * We have to be careful here because we could be looking at a
624                          * page that someone has just added to the relation and not yet
625                          * been able to initialize (see RelationGetBufferForTuple). To
626                          * protect against that, release the buffer lock, grab the
627                          * relation extension lock momentarily, and re-lock the buffer. If
628                          * the page is still uninitialized by then, it must be left over
629                          * from a crashed backend, and we can initialize it.
630                          *
631                          * We don't really need the relation lock when this is a new or
632                          * temp relation, but it's probably not worth the code space to
633                          * check that, since this surely isn't a critical path.
634                          *
635                          * Note: the comparable code in vacuum.c need not worry because
636                          * it's got exclusive lock on the whole relation.
637                          */
638                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
639                         LockRelationForExtension(onerel, ExclusiveLock);
640                         UnlockRelationForExtension(onerel, ExclusiveLock);
641                         LockBufferForCleanup(buf);
642                         if (PageIsNew(page))
643                         {
644                                 ereport(WARNING,
645                                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
646                                                 relname, blkno)));
647                                 PageInit(page, BufferGetPageSize(buf), 0);
648                                 empty_pages++;
649                         }
650                         freespace = PageGetHeapFreeSpace(page);
651                         MarkBufferDirty(buf);
652                         UnlockReleaseBuffer(buf);
653
654                         RecordPageWithFreeSpace(onerel, blkno, freespace);
655                         continue;
656                 }
657
658                 if (PageIsEmpty(page))
659                 {
660                         empty_pages++;
661                         freespace = PageGetHeapFreeSpace(page);
662
663                         /* empty pages are always all-visible */
664                         if (!PageIsAllVisible(page))
665                         {
666                                 PageSetAllVisible(page);
667                                 MarkBufferDirty(buf);
668                                 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
669                                                                   vmbuffer, InvalidTransactionId);
670                         }
671
672                         UnlockReleaseBuffer(buf);
673                         RecordPageWithFreeSpace(onerel, blkno, freespace);
674                         continue;
675                 }
676
677                 /*
678                  * Prune all HOT-update chains in this page.
679                  *
680                  * We count tuples removed by the pruning step as removed by VACUUM.
681                  */
682                 tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
683                                                                                  &vacrelstats->latestRemovedXid);
684
685                 /*
686                  * Now scan the page to collect vacuumable items and check for tuples
687                  * requiring freezing.
688                  */
689                 all_visible = true;
690                 has_dead_tuples = false;
691                 nfrozen = 0;
692                 hastup = false;
693                 prev_dead_count = vacrelstats->num_dead_tuples;
694                 maxoff = PageGetMaxOffsetNumber(page);
695
696                 /*
697                  * Note: If you change anything in the loop below, also look at
698                  * heap_page_is_all_visible to see if that needs to be changed.
699                  */
700                 for (offnum = FirstOffsetNumber;
701                          offnum <= maxoff;
702                          offnum = OffsetNumberNext(offnum))
703                 {
704                         ItemId          itemid;
705
706                         itemid = PageGetItemId(page, offnum);
707
708                         /* Unused items require no processing, but we count 'em */
709                         if (!ItemIdIsUsed(itemid))
710                         {
711                                 nunused += 1;
712                                 continue;
713                         }
714
715                         /* Redirect items mustn't be touched */
716                         if (ItemIdIsRedirected(itemid))
717                         {
718                                 hastup = true;  /* this page won't be truncatable */
719                                 continue;
720                         }
721
722                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
723
724                         /*
725                          * DEAD item pointers are to be vacuumed normally; but we don't
726                          * count them in tups_vacuumed, else we'd be double-counting (at
727                          * least in the common case where heap_page_prune() just freed up
728                          * a non-HOT tuple).
729                          */
730                         if (ItemIdIsDead(itemid))
731                         {
732                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
733                                 all_visible = false;
734                                 continue;
735                         }
736
737                         Assert(ItemIdIsNormal(itemid));
738
739                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
740                         tuple.t_len = ItemIdGetLength(itemid);
741
742                         tupgone = false;
743
744                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
745                         {
746                                 case HEAPTUPLE_DEAD:
747
748                                         /*
749                                          * Ordinarily, DEAD tuples would have been removed by
750                                          * heap_page_prune(), but it's possible that the tuple
751                                          * state changed since heap_page_prune() looked.  In
752                                          * particular an INSERT_IN_PROGRESS tuple could have
753                                          * changed to DEAD if the inserter aborted.  So this
754                                          * cannot be considered an error condition.
755                                          *
756                                          * If the tuple is HOT-updated then it must only be
757                                          * removed by a prune operation; so we keep it just as if
758                                          * it were RECENTLY_DEAD.  Also, if it's a heap-only
759                                          * tuple, we choose to keep it, because it'll be a lot
760                                          * cheaper to get rid of it in the next pruning pass than
761                                          * to treat it like an indexed tuple.
762                                          */
763                                         if (HeapTupleIsHotUpdated(&tuple) ||
764                                                 HeapTupleIsHeapOnly(&tuple))
765                                                 nkeep += 1;
766                                         else
767                                                 tupgone = true; /* we can delete the tuple */
768                                         all_visible = false;
769                                         break;
770                                 case HEAPTUPLE_LIVE:
771                                         /* Tuple is good --- but let's do some validity checks */
772                                         if (onerel->rd_rel->relhasoids &&
773                                                 !OidIsValid(HeapTupleGetOid(&tuple)))
774                                                 elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
775                                                          relname, blkno, offnum);
776
777                                         /*
778                                          * Is the tuple definitely visible to all transactions?
779                                          *
780                                          * NB: Like with per-tuple hint bits, we can't set the
781                                          * PD_ALL_VISIBLE flag if the inserter committed
782                                          * asynchronously. See SetHintBits for more info. Check
783                                          * that the HEAP_XMIN_COMMITTED hint bit is set because of
784                                          * that.
785                                          */
786                                         if (all_visible)
787                                         {
788                                                 TransactionId xmin;
789
790                                                 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
791                                                 {
792                                                         all_visible = false;
793                                                         break;
794                                                 }
795
796                                                 /*
797                                                  * The inserter definitely committed. But is it old
798                                                  * enough that everyone sees it as committed?
799                                                  */
800                                                 xmin = HeapTupleHeaderGetXmin(tuple.t_data);
801                                                 if (!TransactionIdPrecedes(xmin, OldestXmin))
802                                                 {
803                                                         all_visible = false;
804                                                         break;
805                                                 }
806
807                                                 /* Track newest xmin on page. */
808                                                 if (TransactionIdFollows(xmin, visibility_cutoff_xid))
809                                                         visibility_cutoff_xid = xmin;
810                                         }
811                                         break;
812                                 case HEAPTUPLE_RECENTLY_DEAD:
813
814                                         /*
815                                          * If tuple is recently deleted then we must not remove it
816                                          * from relation.
817                                          */
818                                         nkeep += 1;
819                                         all_visible = false;
820                                         break;
821                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
822                                         /* This is an expected case during concurrent vacuum */
823                                         all_visible = false;
824                                         break;
825                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
826                                         /* This is an expected case during concurrent vacuum */
827                                         all_visible = false;
828                                         break;
829                                 default:
830                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
831                                         break;
832                         }
833
834                         if (tupgone)
835                         {
836                                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
837                                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
838                                                                                          &vacrelstats->latestRemovedXid);
839                                 tups_vacuumed += 1;
840                                 has_dead_tuples = true;
841                         }
842                         else
843                         {
844                                 num_tuples += 1;
845                                 hastup = true;
846
847                                 /*
848                                  * Each non-removable tuple must be checked to see if it needs
849                                  * freezing.  Note we already have exclusive buffer lock.
850                                  */
851                                 if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
852                                                                           MultiXactFrzLimit))
853                                         frozen[nfrozen++] = offnum;
854                         }
855                 }                                               /* scan along page */
856
857                 /*
858                  * If we froze any tuples, mark the buffer dirty, and write a WAL
859                  * record recording the changes.  We must log the changes to be
860                  * crash-safe against future truncation of CLOG.
861                  */
862                 if (nfrozen > 0)
863                 {
864                         MarkBufferDirty(buf);
865                         if (RelationNeedsWAL(onerel))
866                         {
867                                 XLogRecPtr      recptr;
868
869                                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
870                                                                                  MultiXactFrzLimit, frozen, nfrozen);
871                                 PageSetLSN(page, recptr);
872                         }
873                 }
874
875                 /*
876                  * If there are no indexes then we can vacuum the page right now
877                  * instead of doing a second scan.
878                  */
879                 if (nindexes == 0 &&
880                         vacrelstats->num_dead_tuples > 0)
881                 {
882                         /* Remove tuples from heap */
883                         lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
884
885                         /*
886                          * Forget the now-vacuumed tuples, and press on, but be careful
887                          * not to reset latestRemovedXid since we want that value to be
888                          * valid.
889                          */
890                         vacrelstats->num_dead_tuples = 0;
891                         vacuumed_pages++;
892                 }
893
894                 freespace = PageGetHeapFreeSpace(page);
895
896                 /* mark page all-visible, if appropriate */
897                 if (all_visible && !all_visible_according_to_vm)
898                 {
899                         /*
900                          * It should never be the case that the visibility map page is set
901                          * while the page-level bit is clear, but the reverse is allowed
902          * (if checksums are not enabled).  Regardless, set both bits
903                          * so that we get back in sync.
904                          *
905                          * NB: If the heap page is all-visible but the VM bit is not set,
906                          * we don't need to dirty the heap page.  However, if checksums are
907                          * enabled, we do need to make sure that the heap page is dirtied
908                          * before passing it to visibilitymap_set(), because it may be
909                          * logged.  Given that this situation should only happen in rare
910                          * cases after a crash, it is not worth optimizing.
911                          */
912                         PageSetAllVisible(page);
913                         MarkBufferDirty(buf);
914                         visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
915                                                           vmbuffer, visibility_cutoff_xid);
916                 }
917
918                 /*
919                  * As of PostgreSQL 9.2, the visibility map bit should never be set if
920                  * the page-level bit is clear.  However, it's possible that the bit
921                  * got cleared after we checked it and before we took the buffer
922                  * content lock, so we must recheck before jumping to the conclusion
923                  * that something bad has happened.
924                  */
925                 else if (all_visible_according_to_vm && !PageIsAllVisible(page)
926                                  && visibilitymap_test(onerel, blkno, &vmbuffer))
927                 {
928                         elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
929                                  relname, blkno);
930                         visibilitymap_clear(onerel, blkno, vmbuffer);
931                 }
932
933                 /*
934                  * It's possible for the value returned by GetOldestXmin() to move
935                  * backwards, so it's not wrong for us to see tuples that appear to
936                  * not be visible to everyone yet, while PD_ALL_VISIBLE is already
937                  * set. The real safe xmin value never moves backwards, but
938                  * GetOldestXmin() is conservative and sometimes returns a value
939                  * that's unnecessarily small, so if we see that contradiction it just
940                  * means that the tuples that we think are not visible to everyone yet
941                  * actually are, and the PD_ALL_VISIBLE flag is correct.
942                  *
943                  * There should never be dead tuples on a page with PD_ALL_VISIBLE
944                  * set, however.
945                  */
946                 else if (PageIsAllVisible(page) && has_dead_tuples)
947                 {
948                         elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
949                                  relname, blkno);
950                         PageClearAllVisible(page);
951                         MarkBufferDirty(buf);
952                         visibilitymap_clear(onerel, blkno, vmbuffer);
953                 }
954
955                 UnlockReleaseBuffer(buf);
956
957                 /* Remember the location of the last page with nonremovable tuples */
958                 if (hastup)
959                         vacrelstats->nonempty_pages = blkno + 1;
960
961                 /*
962                  * If we remembered any tuples for deletion, then the page will be
963                  * visited again by lazy_vacuum_heap, which will compute and record
964                  * its post-compaction free space.  If not, then we're done with this
965                  * page, so remember its free space as-is.  (This path will always be
966                  * taken if there are no indexes.)
967                  */
968                 if (vacrelstats->num_dead_tuples == prev_dead_count)
969                         RecordPageWithFreeSpace(onerel, blkno, freespace);
970         }
971
972         /* save stats for use later */
973         vacrelstats->scanned_tuples = num_tuples;
974         vacrelstats->tuples_deleted = tups_vacuumed;
975
976         /* now we can compute the new value for pg_class.reltuples */
977         vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
978                                                                                                                  nblocks,
979                                                                                                   vacrelstats->scanned_pages,
980                                                                                                                  num_tuples);
981
982         /*
983          * Release any remaining pin on visibility map page.
984          */
985         if (BufferIsValid(vmbuffer))
986         {
987                 ReleaseBuffer(vmbuffer);
988                 vmbuffer = InvalidBuffer;
989         }
990
991         /* If any tuples need to be deleted, perform final vacuum cycle */
992         /* XXX put a threshold on min number of tuples here? */
993         if (vacrelstats->num_dead_tuples > 0)
994         {
995                 /* Log cleanup info before we touch indexes */
996                 vacuum_log_cleanup_info(onerel, vacrelstats);
997
998                 /* Remove index entries */
999                 for (i = 0; i < nindexes; i++)
1000                         lazy_vacuum_index(Irel[i],
1001                                                           &indstats[i],
1002                                                           vacrelstats);
1003                 /* Remove tuples from heap */
1004                 lazy_vacuum_heap(onerel, vacrelstats);
1005                 vacrelstats->num_index_scans++;
1006         }
1007
1008         /* Do post-vacuum cleanup and statistics update for each index */
1009         for (i = 0; i < nindexes; i++)
1010                 lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
1011
1012         /* If no indexes, make log report that lazy_vacuum_heap would've made */
1013         if (vacuumed_pages)
1014                 ereport(elevel,
1015                                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
1016                                                 RelationGetRelationName(onerel),
1017                                                 tups_vacuumed, vacuumed_pages)));
1018
1019         ereport(elevel,
1020                         (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
1021                                         RelationGetRelationName(onerel),
1022                                         tups_vacuumed, num_tuples,
1023                                         vacrelstats->scanned_pages, nblocks),
1024                          errdetail("%.0f dead row versions cannot be removed yet.\n"
1025                                            "There were %.0f unused item pointers.\n"
1026                                            "%u pages are entirely empty.\n"
1027                                            "%s.",
1028                                            nkeep,
1029                                            nunused,
1030                                            empty_pages,
1031                                            pg_rusage_show(&ru0))));
1032 }
1033
1034
1035 /*
1036  *      lazy_vacuum_heap() -- second pass over the heap
1037  *
1038  *              This routine marks dead tuples as unused and compacts out free
1039  *              space on their pages.  Pages not having dead tuples recorded from
1040  *              lazy_scan_heap are not visited at all.
1041  *
1042  * Note: the reason for doing this as a second pass is we cannot remove
1043  * the tuples until we've removed their index entries, and we want to
1044  * process index entry removal in batches as large as possible.
1045  */
1046 static void
1047 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
1048 {
1049         int                     tupindex;
1050         int                     npages;
1051         PGRUsage        ru0;
1052         Buffer          vmbuffer = InvalidBuffer;
1053
1054         pg_rusage_init(&ru0);
1055         npages = 0;
1056
1057         tupindex = 0;
1058         while (tupindex < vacrelstats->num_dead_tuples)
1059         {
1060                 BlockNumber tblk;
1061                 Buffer          buf;
1062                 Page            page;
1063                 Size            freespace;
1064
1065                 vacuum_delay_point();
1066
1067                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1068                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
1069                                                                  vac_strategy);
1070                 if (!ConditionalLockBufferForCleanup(buf))
1071                 {
1072                         ReleaseBuffer(buf);
1073                         ++tupindex;
1074                         continue;
1075                 }
1076                 tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
1077                                                                         &vmbuffer);
1078
1079                 /* Now that we've compacted the page, record its available space */
1080                 page = BufferGetPage(buf);
1081                 freespace = PageGetHeapFreeSpace(page);
1082
1083                 UnlockReleaseBuffer(buf);
1084                 RecordPageWithFreeSpace(onerel, tblk, freespace);
1085                 npages++;
1086         }
1087
1088         if (BufferIsValid(vmbuffer))
1089         {
1090                 ReleaseBuffer(vmbuffer);
1091                 vmbuffer = InvalidBuffer;
1092         }
1093
1094         ereport(elevel,
1095                         (errmsg("\"%s\": removed %d row versions in %d pages",
1096                                         RelationGetRelationName(onerel),
1097                                         tupindex, npages),
1098                          errdetail("%s.",
1099                                            pg_rusage_show(&ru0))));
1100 }
1101
1102 /*
1103  *      lazy_vacuum_page() -- free dead tuples on a page
1104  *                                       and repair its fragmentation.
1105  *
1106  * Caller must hold pin and buffer cleanup lock on the buffer.
1107  *
1108  * tupindex is the index in vacrelstats->dead_tuples of the first dead
1109  * tuple for this page.  We assume the rest follow sequentially.
1110  * The return value is the first tupindex after the tuples of this page.
1111  */
1112 static int
1113 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
1114                                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
1115 {
1116         Page            page = BufferGetPage(buffer);
1117         OffsetNumber unused[MaxOffsetNumber];
1118         int                     uncnt = 0;
1119         TransactionId   visibility_cutoff_xid;
1120
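        /*
         * Do the page update, visibility-map update, and WAL insertion as one
         * critical section, so that a failure partway through escalates to
         * PANIC rather than leaving the modified page inconsistent with WAL.
         */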
1121         START_CRIT_SECTION();
1122
1123         for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
1124         {
1125                 BlockNumber tblk;
1126                 OffsetNumber toff;
1127                 ItemId          itemid;
1128
1129                 tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
1130                 if (tblk != blkno)
1131                         break;                          /* past end of tuples for this block */
1132                 toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
1133                 itemid = PageGetItemId(page, toff);
1134                 ItemIdSetUnused(itemid);
1135                 unused[uncnt++] = toff;
1136         }
1137
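        /* Reclaim the space used by the removed tuples and defragment the page */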
1138         PageRepairFragmentation(page);
1139
1140         /*
1141          * Mark buffer dirty before we write WAL.
1142          *
1143          * If checksums are enabled, visibilitymap_set() may log the heap page, so
1144          * we must mark heap buffer dirty before calling visibilitymap_set().
1145          */
1146         MarkBufferDirty(buffer);
1147
1148         /*
1149          * Now that we have removed the dead tuples from the page, once again check
1150          * if the page has become all-visible.
1151          */
1152         if (!visibilitymap_test(onerel, blkno, vmbuffer) &&
1153                 heap_page_is_all_visible(buffer, &visibility_cutoff_xid))
1154         {
1155                 Assert(BufferIsValid(*vmbuffer));
1156                 PageSetAllVisible(page);
1157                 visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
1158                                 visibility_cutoff_xid);
1159         }
1160
1161         /* XLOG stuff */
1162         if (RelationNeedsWAL(onerel))
1163         {
1164                 XLogRecPtr      recptr;
1165
1166                 recptr = log_heap_clean(onerel, buffer,
1167                                                                 NULL, 0, NULL, 0,
1168                                                                 unused, uncnt,
1169                                                                 vacrelstats->latestRemovedXid);
1170                 PageSetLSN(page, recptr);
1171         }
1172
1173         END_CRIT_SECTION();
1174
1175         return tupindex;
1176 }
1177
1178 /*
1179  *      lazy_check_needs_freeze() -- scan page to see if any tuples
1180  *                                       need to be cleaned to avoid wraparound
1181  *
1182  * Returns true if the page needs to be vacuumed using cleanup lock.
1183  */
1184 static bool
1185 lazy_check_needs_freeze(Buffer buf)
1186 {
1187         Page            page;
1188         OffsetNumber offnum,
1189                                 maxoff;
1190         HeapTupleHeader tupleheader;
1191
1192         page = BufferGetPage(buf);
1193
1194         if (PageIsNew(page) || PageIsEmpty(page))
1195         {
1196                 /* PageIsNew probably shouldn't happen... */
1197                 return false;
1198         }
1199
1200         maxoff = PageGetMaxOffsetNumber(page);
1201         for (offnum = FirstOffsetNumber;
1202                  offnum <= maxoff;
1203                  offnum = OffsetNumberNext(offnum))
1204         {
1205                 ItemId          itemid;
1206
1207                 itemid = PageGetItemId(page, offnum);
1208
1209                 if (!ItemIdIsNormal(itemid))
1210                         continue;
1211
1212                 tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
1213
1214                 if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
1215                                                                         MultiXactFrzLimit, buf))
1216                         return true;
1217         }                                                       /* scan along page */
1218
1219         return false;
1220 }
1221
1222
1223 /*
1224  *      lazy_vacuum_index() -- vacuum one index relation.
1225  *
1226  *              Delete all the index entries pointing to tuples listed in
1227  *              vacrelstats->dead_tuples, and update running statistics.
1228  */
1229 static void
1230 lazy_vacuum_index(Relation indrel,
1231                                   IndexBulkDeleteResult **stats,
1232                                   LVRelStats *vacrelstats)
1233 {
1234         IndexVacuumInfo ivinfo;
1235         PGRUsage        ru0;
1236
1237         pg_rusage_init(&ru0);
1238
1239         ivinfo.index = indrel;
1240         ivinfo.analyze_only = false;
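        /* the heap tuple count we pass below is only the pre-vacuum estimate */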
1241         ivinfo.estimated_count = true;
1242         ivinfo.message_level = elevel;
1243         ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
1244         ivinfo.strategy = vac_strategy;
1245
1246         /* Do bulk deletion */
1247         *stats = index_bulk_delete(&ivinfo, *stats,
1248                                                            lazy_tid_reaped, (void *) vacrelstats);
1249
1250         ereport(elevel,
1251                         (errmsg("scanned index \"%s\" to remove %d row versions",
1252                                         RelationGetRelationName(indrel),
1253                                         vacrelstats->num_dead_tuples),
1254                          errdetail("%s.", pg_rusage_show(&ru0))));
1255 }
1256
1257 /*
1258  *      lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
1259  */
1260 static void
1261 lazy_cleanup_index(Relation indrel,
1262                                    IndexBulkDeleteResult *stats,
1263                                    LVRelStats *vacrelstats)
1264 {
1265         IndexVacuumInfo ivinfo;
1266         PGRUsage        ru0;
1267
1268         pg_rusage_init(&ru0);
1269
1270         ivinfo.index = indrel;
1271         ivinfo.analyze_only = false;
1272         ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
1273         ivinfo.message_level = elevel;
1274         ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
1275         ivinfo.strategy = vac_strategy;
1276
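        /*
         * The index AM may return NULL if it has no statistics to report;
         * in that case there is nothing to update in pg_class or to log.
         */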
1277         stats = index_vacuum_cleanup(&ivinfo, stats);
1278
1279         if (!stats)
1280                 return;
1281
1282         /*
1283          * Now update statistics in pg_class, but only if the index says the count
1284          * is accurate.
1285          */
1286         if (!stats->estimated_count)
1287                 vac_update_relstats(indrel,
1288                                                         stats->num_pages,
1289                                                         stats->num_index_tuples,
1290                                                         0,
1291                                                         false,
1292                                                         InvalidTransactionId,
1293                                                         InvalidMultiXactId);
1294
1295         ereport(elevel,
1296                         (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
1297                                         RelationGetRelationName(indrel),
1298                                         stats->num_index_tuples,
1299                                         stats->num_pages),
1300                          errdetail("%.0f index row versions were removed.\n"
1301                          "%u index pages have been deleted, %u are currently reusable.\n"
1302                                            "%s.",
1303                                            stats->tuples_removed,
1304                                            stats->pages_deleted, stats->pages_free,
1305                                            pg_rusage_show(&ru0))));
1306
1307         pfree(stats);
1308 }
1309
1310 /*
1311  * lazy_truncate_heap - try to truncate off any empty pages at the end
1312  */
1313 static void
1314 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
1315 {
1316         BlockNumber old_rel_pages = vacrelstats->rel_pages;
1317         BlockNumber new_rel_pages;
1318         PGRUsage        ru0;
1319         int                     lock_retry;
1320
1321         pg_rusage_init(&ru0);
1322
1323         /*
1324          * Loop until no more truncating can be done.
1325          */
1326         do
1327         {
1328                 /*
1329                  * We need full exclusive lock on the relation in order to do
1330                  * truncation. If we can't get it, give up rather than waiting --- we
1331                  * don't want to block other backends, and we don't want to deadlock
1332                  * (which is quite possible considering we already hold a lower-grade
1333                  * lock).
1334                  */
1335                 vacrelstats->lock_waiter_detected = false;
1336                 lock_retry = 0;
1337                 while (true)
1338                 {
1339                         if (ConditionalLockRelation(onerel, AccessExclusiveLock))
1340                                 break;
1341
1342                         /*
1343                          * Check for interrupts while trying to (re-)acquire the exclusive
1344                          * lock.
1345                          */
1346                         CHECK_FOR_INTERRUPTS();
1347
1348                         if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
1349                                                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
1350                         {
1351                                 /*
1352                                  * retries, so we give up truncating.
1353                                  * retries. This means we give up truncating.
1354                                  */
1355                                 vacrelstats->lock_waiter_detected = true;
1356                                 ereport(elevel,
1357                                                 (errmsg("\"%s\": stopping truncate due to conflicting lock request",
1358                                                                 RelationGetRelationName(onerel))));
1359                                 return;
1360                         }
1361
1362                         pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL);
1363                 }
1364
1365                 /*
1366                  * Now that we have exclusive lock, look to see if the rel has grown
1367                  * whilst we were vacuuming with non-exclusive lock.  If so, give up;
1368                  * the newly added pages presumably contain non-deletable tuples.
1369                  */
1370                 new_rel_pages = RelationGetNumberOfBlocks(onerel);
1371                 if (new_rel_pages != old_rel_pages)
1372                 {
1373                         /*
1374                          * Note: we intentionally don't update vacrelstats->rel_pages with
1375                          * the new rel size here.  If we did, it would amount to assuming
1376                          * that the new pages are empty, which is unlikely. Leaving the
1377                          * numbers alone amounts to assuming that the new pages have the
1378                          * same tuple density as existing ones, which is less unlikely.
1379                          */
1380                         UnlockRelation(onerel, AccessExclusiveLock);
1381                         return;
1382                 }
1383
1384                 /*
1385                  * Scan backwards from the end to verify that the end pages actually
1386                  * contain no tuples.  This is *necessary*, not optional, because
1387                  * other backends could have added tuples to these pages whilst we
1388                  * were vacuuming.
1389                  */
1390                 new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
1391
1392                 if (new_rel_pages >= old_rel_pages)
1393                 {
1394                         /* can't do anything after all */
1395                         UnlockRelation(onerel, AccessExclusiveLock);
1396                         return;
1397                 }
1398
1399                 /*
1400                  * Okay to truncate.
1401                  */
1402                 RelationTruncate(onerel, new_rel_pages);
1403
1404                 /*
1405                  * We can release the exclusive lock as soon as we have truncated.
1406                  * Other backends can't safely access the relation until they have
1407                  * processed the smgr invalidation that smgrtruncate sent out ... but
1408                  * that should happen as part of standard invalidation processing once
1409                  * they acquire lock on the relation.
1410                  */
1411                 UnlockRelation(onerel, AccessExclusiveLock);
1412
1413                 /*
1414                  * Update statistics.  Here, it *is* correct to adjust rel_pages
1415                  * without also touching reltuples, since the tuple count wasn't
1416                  * changed by the truncation.
1417                  */
1418                 vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
1419                 vacrelstats->rel_pages = new_rel_pages;
1420
1421                 ereport(elevel,
1422                                 (errmsg("\"%s\": truncated %u to %u pages",
1423                                                 RelationGetRelationName(onerel),
1424                                                 old_rel_pages, new_rel_pages),
1425                                  errdetail("%s.",
1426                                                    pg_rusage_show(&ru0))));
1427                 old_rel_pages = new_rel_pages;
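                /*
                 * Loop back only if count_nondeletable_pages() stopped early
                 * because it detected a conflicting lock request; there may
                 * still be empty end pages left to truncate.
                 */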
1428         } while (new_rel_pages > vacrelstats->nonempty_pages &&
1429                          vacrelstats->lock_waiter_detected);
1430 }
1431
1432 /*
1433  * Rescan end pages to verify that they are (still) empty of tuples.
1434  *
1435  * Returns number of nondeletable pages (last nonempty page + 1).
1436  */
1437 static BlockNumber
1438 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
1439 {
1440         BlockNumber blkno;
1441         instr_time      starttime;
1442
1443         /* Initialize starttime, used below when checking for conflicting lock requests */
1444         INSTR_TIME_SET_CURRENT(starttime);
1445
1446         /* Strange coding of loop control is needed because blkno is unsigned */
1447         blkno = vacrelstats->rel_pages;
1448         while (blkno > vacrelstats->nonempty_pages)
1449         {
1450                 Buffer          buf;
1451                 Page            page;
1452                 OffsetNumber offnum,
1453                                         maxoff;
1454                 bool            hastup;
1455
1456                 /*
1457                  * Check if another process is requesting a lock on our relation. We are
1458                  * holding an AccessExclusiveLock here, so they will be waiting. We
1459                  * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
1460                  * only check if that interval has elapsed once every 32 blocks to
1461                  * keep the number of system calls and actual shared lock table
1462                  * lookups to a minimum.
1463                  */
1464                 if ((blkno % 32) == 0)
1465                 {
1466                         instr_time      currenttime;
1467                         instr_time      elapsed;
1468
1469                         INSTR_TIME_SET_CURRENT(currenttime);
1470                         elapsed = currenttime;
1471                         INSTR_TIME_SUBTRACT(elapsed, starttime);
1472                         if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
1473                                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
1474                         {
1475                                 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
1476                                 {
1477                                         ereport(elevel,
1478                                                         (errmsg("\"%s\": suspending truncate due to conflicting lock request",
1479                                                                         RelationGetRelationName(onerel))));
1480
1481                                         vacrelstats->lock_waiter_detected = true;
1482                                         return blkno;
1483                                 }
1484                                 starttime = currenttime;
1485                         }
1486                 }
1487
1488                 /*
1489                  * We don't insert a vacuum delay point here, because we have an
1490                  * exclusive lock on the table, which we want to hold for as short a
1491                  * time as possible.  We still need to check for interrupts, however.
1492                  */
1493                 CHECK_FOR_INTERRUPTS();
1494
1495                 blkno--;
1496
1497                 buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
1498                                                                  RBM_NORMAL, vac_strategy);
1499
1500                 /* In this phase we only need shared access to the buffer */
1501                 LockBuffer(buf, BUFFER_LOCK_SHARE);
1502
1503                 page = BufferGetPage(buf);
1504
1505                 if (PageIsNew(page) || PageIsEmpty(page))
1506                 {
1507                         /* PageIsNew probably shouldn't happen... */
1508                         UnlockReleaseBuffer(buf);
1509                         continue;
1510                 }
1511
1512                 hastup = false;
1513                 maxoff = PageGetMaxOffsetNumber(page);
1514                 for (offnum = FirstOffsetNumber;
1515                          offnum <= maxoff;
1516                          offnum = OffsetNumberNext(offnum))
1517                 {
1518                         ItemId          itemid;
1519
1520                         itemid = PageGetItemId(page, offnum);
1521
1522                         /*
1523                          * Note: any non-unused item should be taken as a reason to keep
1524                          * this page.  We formerly thought that DEAD tuples could be
1525                          * thrown away, but that's not so, because we'd not have cleaned
1526                          * out their index entries.
1527                          */
1528                         if (ItemIdIsUsed(itemid))
1529                         {
1530                                 hastup = true;
1531                                 break;                  /* can stop scanning */
1532                         }
1533                 }                                               /* scan along page */
1534
1535                 UnlockReleaseBuffer(buf);
1536
1537                 /* Done scanning if we found a tuple here */
1538                 if (hastup)
1539                         return blkno + 1;
1540         }
1541
1542         /*
1543          * If we fall out of the loop, all the previously-thought-to-be-empty
1544          * pages still are; we need not bother to look at the last known-nonempty
1545          * page.
1546          */
1547         return vacrelstats->nonempty_pages;
1548 }
1549
1550 /*
1551  * lazy_space_alloc - space allocation decisions for lazy vacuum
1552  *
1553  * See the comments at the head of this file for rationale.
1554  */
1555 static void
1556 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
1557 {
1558         long            maxtuples;
1559
1560         if (vacrelstats->hasindex)
1561         {
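                /*
                 * Size the dead-tuple TID array from maintenance_work_mem,
                 * clamped so it fits in an int and within MaxAllocSize.
                 */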
1562                 maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
1563                 maxtuples = Min(maxtuples, INT_MAX);
1564                 maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
1565
1566                 /* curious coding here to ensure the multiplication can't overflow */
1567                 if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
1568                         maxtuples = relblocks * LAZY_ALLOC_TUPLES;
1569
1570                 /* stay sane if small maintenance_work_mem */
1571                 maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
1572         }
1573         else
1574         {
1575                 maxtuples = MaxHeapTuplesPerPage;
1576         }
1577
1578         vacrelstats->num_dead_tuples = 0;
1579         vacrelstats->max_dead_tuples = (int) maxtuples;
1580         vacrelstats->dead_tuples = (ItemPointer)
1581                 palloc(maxtuples * sizeof(ItemPointerData));
1582 }
1583
1584 /*
1585  * lazy_record_dead_tuple - remember one deletable tuple
1586  */
1587 static void
1588 lazy_record_dead_tuple(LVRelStats *vacrelstats,
1589                                            ItemPointer itemptr)
1590 {
1591         /*
1592          * The array shouldn't overflow under normal behavior, but perhaps it
1593          * could if we are given a really small maintenance_work_mem. In that
1594          * case, just forget the last few tuples (we'll get 'em next time).
1595          */
1596         if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
1597         {
1598                 vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
1599                 vacrelstats->num_dead_tuples++;
1600         }
1601 }
1602
1603 /*
1604  *      lazy_tid_reaped() -- is a particular tid deletable?
1605  *
1606  *              This has the right signature to be an IndexBulkDeleteCallback.
1607  *
1608  *              Assumes dead_tuples array is in sorted order.
1609  */
1610 static bool
1611 lazy_tid_reaped(ItemPointer itemptr, void *state)
1612 {
1613         LVRelStats *vacrelstats = (LVRelStats *) state;
1614         ItemPointer res;
1615
1616         res = (ItemPointer) bsearch((void *) itemptr,
1617                                                                 (void *) vacrelstats->dead_tuples,
1618                                                                 vacrelstats->num_dead_tuples,
1619                                                                 sizeof(ItemPointerData),
1620                                                                 vac_cmp_itemptr);
1621
1622         return (res != NULL);
1623 }
1624
1625 /*
1626  * Comparator routines for use with qsort() and bsearch().
1627  */
1628 static int
1629 vac_cmp_itemptr(const void *left, const void *right)
1630 {
1631         BlockNumber lblk,
1632                                 rblk;
1633         OffsetNumber loff,
1634                                 roff;
1635
1636         lblk = ItemPointerGetBlockNumber((ItemPointer) left);
1637         rblk = ItemPointerGetBlockNumber((ItemPointer) right);
1638
1639         if (lblk < rblk)
1640                 return -1;
1641         if (lblk > rblk)
1642                 return 1;
1643
1644         loff = ItemPointerGetOffsetNumber((ItemPointer) left);
1645         roff = ItemPointerGetOffsetNumber((ItemPointer) right);
1646
1647         if (loff < roff)
1648                 return -1;
1649         if (loff > roff)
1650                 return 1;
1651
1652         return 0;
1653 }
1654
1655 /*
1656  * Check if every tuple in the given page is visible to all current and future
1657  * transactions. Also return the visibility_cutoff_xid, which is the highest
1658  * xmin amongst the visible tuples.
1659  */
1660 static bool
1661 heap_page_is_all_visible(Buffer buf, TransactionId *visibility_cutoff_xid)
1662 {
1663         Page             page = BufferGetPage(buf);
1664         OffsetNumber offnum,
1665                                  maxoff;
1666         bool             all_visible = true;
1667
1668         *visibility_cutoff_xid = InvalidTransactionId;
1669
1670         /*
1671          * This is a stripped down version of the line pointer scan in
1672          * lazy_scan_heap(). So if you change anything here, also check that
1673          * code.
1674          */
1675         maxoff = PageGetMaxOffsetNumber(page);
1676         for (offnum = FirstOffsetNumber;
1677                         offnum <= maxoff && all_visible;
1678                         offnum = OffsetNumberNext(offnum))
1679         {
1680                 ItemId                  itemid;
1681                 HeapTupleData   tuple;
1682
1683                 itemid = PageGetItemId(page, offnum);
1684
1685                 /* Unused or redirect line pointers are of no interest */
1686                 if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
1687                         continue;
1688
1689                 ItemPointerSet(&(tuple.t_self), BufferGetBlockNumber(buf), offnum);
1690
1691                 /*
1692                  * Dead line pointers can have index entries pointing to them, so they
1693                  * can't be treated as visible.
1694                  */
1695                 if (ItemIdIsDead(itemid))
1696                 {
1697                         all_visible = false;
1698                         break;
1699                 }
1700
1701                 Assert(ItemIdIsNormal(itemid));
1702
1703                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1704
1705                 switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
1706                 {
1707                         case HEAPTUPLE_LIVE:
1708                                 {
1709                                         TransactionId xmin;
1710
1711                                         /* Check comments in lazy_scan_heap. */
1712                                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1713                                         {
1714                                                 all_visible = false;
1715                                                 break;
1716                                         }
1717
1718                                         /*
1719                                          * The inserter definitely committed. But is it old
1720                                          * enough that everyone sees it as committed?
1721                                          */
1722                                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
1723                                         if (!TransactionIdPrecedes(xmin, OldestXmin))
1724                                         {
1725                                                 all_visible = false;
1726                                                 break;
1727                                         }
1728
1729                                         /* Track newest xmin on page. */
1730                                         if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
1731                                                 *visibility_cutoff_xid = xmin;
1732                                 }
1733                                 break;
1734
1735                         case HEAPTUPLE_DEAD:
1736                         case HEAPTUPLE_RECENTLY_DEAD:
1737                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1738                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1739                                 all_visible = false;
1740                                 break;
1741
1742                         default:
1743                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1744                                 break;
1745                 }
1746         }                                               /* scan along page */
1747
1748         return all_visible;
1749 }