* relations with finite memory space usage. To do that, we set upper bounds
* on the number of tuples and pages we will keep track of at once.
*
- * We are willing to use at most maintenance_work_mem memory space to keep
- * track of dead tuples. We initially allocate an array of TIDs of that size,
- * with an upper limit that depends on table size (this limit ensures we don't
- * allocate a huge area uselessly for vacuuming small tables). If the array
- * threatens to overflow, we suspend the heap scan phase and perform a pass of
- * index cleanup and page compaction, then resume the heap scan with an empty
- * TID array.
+ * We are willing to use at most maintenance_work_mem (or perhaps
+ * autovacuum_work_mem) memory space to keep track of dead tuples. We
+ * initially allocate an array of TIDs of that size, with an upper limit that
+ * depends on table size (this limit ensures we don't allocate a huge area
+ * uselessly for vacuuming small tables). If the array threatens to overflow,
+ * we suspend the heap scan phase and perform a pass of index cleanup and page
+ * compaction, then resume the heap scan with an empty TID array.
*
* If we're processing a table with no indexes, we can just vacuum each page
* as we go; there's no need to save up multiple tuples to minimize the number
* of index scans performed.  So we don't use maintenance_work_mem memory for
* the TID array, just enough to hold as many heap tuples as fit on one page.
*
*
- * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
#include "access/genam.h"
#include "access/heapam.h"
#include "access/heapam_xlog.h"
+#include "access/htup_details.h"
+#include "access/multixact.h"
#include "access/transam.h"
#include "access/visibilitymap.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
#include "catalog/storage.h"
#include "commands/dbcommands.h"
+#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "portability/instr_time.h"
#include "postmaster/autovacuum.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#define REL_TRUNCATE_MINIMUM 1000
#define REL_TRUNCATE_FRACTION 16
+/*
+ * Timing parameters for truncate locking heuristics.
+ *
+ * These were not exposed as user tunable GUC values because it didn't seem
+ * that the potential for improvement was great enough to merit the cost of
+ * supporting them.
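+ *
+ * As used below: lazy_truncate_heap() retries its ConditionalLockRelation()
+ * call every VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL ms and gives up once about
+ * VACUUM_TRUNCATE_LOCK_TIMEOUT ms of waiting has accumulated, while
+ * count_nondeletable_pages() checks for lock waiters roughly once per
+ * VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL ms.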
+ */
+#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20 /* ms */
+#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50 /* ms */
+#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000 /* ms */
+
/*
* Guesstimation of number of dead tuples per page. This is used to
* provide an upper limit to memory allocated when vacuuming small
BlockNumber old_rel_pages; /* previous value of pg_class.relpages */
BlockNumber rel_pages; /* total number of pages */
BlockNumber scanned_pages; /* number of pages we examined */
+ BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */
+ BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */
double scanned_tuples; /* counts only tuples on scanned pages */
double old_rel_tuples; /* previous value of pg_class.reltuples */
double new_rel_tuples; /* new estimated total # of tuples */
+ double new_dead_tuples; /* new estimated total # of dead tuples */
BlockNumber pages_removed;
double tuples_deleted;
BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
ItemPointer dead_tuples; /* array of ItemPointerData */
int num_index_scans;
TransactionId latestRemovedXid;
+ bool lock_waiter_detected;
} LVRelStats;
static TransactionId OldestXmin;
static TransactionId FreezeLimit;
+static MultiXactId MultiXactCutoff;
static BufferAccessStrategy vac_strategy;
/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
- Relation *Irel, int nindexes, bool scan_all);
+ Relation *Irel, int nindexes, bool aggressive);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
-static bool lazy_check_needs_freeze(Buffer buf);
+static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
static void lazy_vacuum_index(Relation indrel,
IndexBulkDeleteResult **stats,
LVRelStats *vacrelstats);
IndexBulkDeleteResult *stats,
LVRelStats *vacrelstats);
static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
- int tupindex, LVRelStats *vacrelstats);
+ int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
+static bool should_attempt_truncation(LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
LVRelStats *vacrelstats);
ItemPointer itemptr);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static int vac_cmp_itemptr(const void *left, const void *right);
+static bool heap_page_is_all_visible(Relation rel, Buffer buf,
+ TransactionId *visibility_cutoff_xid, bool *all_frozen);
/*
* and locked the relation.
*/
void
-lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
+lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params,
BufferAccessStrategy bstrategy)
{
LVRelStats *vacrelstats;
Relation *Irel;
int nindexes;
- BlockNumber possibly_freeable;
PGRUsage ru0;
TimestampTz starttime = 0;
long secs;
int usecs;
double read_rate,
write_rate;
- bool scan_all;
- TransactionId freezeTableLimit;
+ bool aggressive; /* should we scan all unfrozen pages? */
+ bool scanned_all_unfrozen; /* actually scanned all such pages? */
+ TransactionId xidFullScanLimit;
+ MultiXactId mxactFullScanLimit;
BlockNumber new_rel_pages;
double new_rel_tuples;
BlockNumber new_rel_allvisible;
+ double new_live_tuples;
TransactionId new_frozen_xid;
+ MultiXactId new_min_multi;
+
+ Assert(params != NULL);
/* measure elapsed time iff autovacuum logging requires it */
- if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+ if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
{
pg_rusage_init(&ru0);
starttime = GetCurrentTimestamp();
}
- if (vacstmt->options & VACOPT_VERBOSE)
+ if (options & VACOPT_VERBOSE)
elevel = INFO;
else
elevel = DEBUG2;
+ pgstat_progress_start_command(PROGRESS_COMMAND_VACUUM,
+ RelationGetRelid(onerel));
+
vac_strategy = bstrategy;
- vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
- onerel->rd_rel->relisshared,
- &OldestXmin, &FreezeLimit, &freezeTableLimit);
- scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
- freezeTableLimit);
+ vacuum_set_xid_limits(onerel,
+ params->freeze_min_age,
+ params->freeze_table_age,
+ params->multixact_freeze_min_age,
+ params->multixact_freeze_table_age,
+ &OldestXmin, &FreezeLimit, &xidFullScanLimit,
+ &MultiXactCutoff, &mxactFullScanLimit);
+
+ /*
+ * We request an aggressive scan if the table's frozen Xid is now older
+ * than or equal to the requested Xid full-table scan limit; or if
+ * the table's minimum MultiXactId is older than or equal to the requested
+ * mxid full-table scan limit.
+ */
+ aggressive = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
+ xidFullScanLimit);
+ aggressive |= MultiXactIdPrecedesOrEquals(onerel->rd_rel->relminmxid,
+ mxactFullScanLimit);
vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
vacrelstats->num_index_scans = 0;
+ vacrelstats->pages_removed = 0;
+ vacrelstats->lock_waiter_detected = false;
/* Open all indexes of the relation */
vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
vacrelstats->hasindex = (nindexes > 0);
/* Do the vacuuming */
- lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
+ lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, aggressive);
/* Done with indexes */
vac_close_indexes(nindexes, Irel, NoLock);
/*
- * Optionally truncate the relation.
+ * Compute whether we actually scanned the whole relation. If we did, we
+ * can adjust relfrozenxid and relminmxid.
*
- * Don't even think about it unless we have a shot at releasing a goodly
- * number of pages. Otherwise, the time taken isn't worth it.
+ * NB: We need to check this before truncating the relation, because that
+ * will change ->rel_pages.
*/
- possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
- if (possibly_freeable > 0 &&
- (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
- possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
+ if ((vacrelstats->scanned_pages + vacrelstats->frozenskipped_pages)
+ < vacrelstats->rel_pages)
+ {
+ Assert(!aggressive);
+ scanned_all_unfrozen = false;
+ }
+ else
+ scanned_all_unfrozen = true;
+
+ /*
+ * Optionally truncate the relation.
+ */
+ if (should_attempt_truncation(vacrelstats))
lazy_truncate_heap(onerel, vacrelstats);
+ /* Report that we are now doing final cleanup */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_FINAL_CLEANUP);
+
/* Vacuum the Free Space Map */
FreeSpaceMapVacuum(onerel);
* is all-visible we'd definitely like to know that. But clamp the value
* to be not more than what we're setting relpages to.
*
- * Also, don't change relfrozenxid if we skipped any pages, since then we
- * don't know for certain that all tuples have a newer xmin.
+ * Also, don't change relfrozenxid/relminmxid if we skipped any pages,
+ * since then we don't know for certain that all tuples have a newer xmin.
*/
new_rel_pages = vacrelstats->rel_pages;
new_rel_tuples = vacrelstats->new_rel_tuples;
new_rel_tuples = vacrelstats->old_rel_tuples;
}
- new_rel_allvisible = visibilitymap_count(onerel);
+ visibilitymap_count(onerel, &new_rel_allvisible, NULL);
if (new_rel_allvisible > new_rel_pages)
new_rel_allvisible = new_rel_pages;
- new_frozen_xid = FreezeLimit;
- if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
- new_frozen_xid = InvalidTransactionId;
+ new_frozen_xid = scanned_all_unfrozen ? FreezeLimit : InvalidTransactionId;
+ new_min_multi = scanned_all_unfrozen ? MultiXactCutoff : InvalidMultiXactId;
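+
+ /*
+ * Passing the invalid values makes vac_update_relstats() leave
+ * relfrozenxid and relminmxid alone, which is what we want when some
+ * unfrozen pages were skipped.
+ */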
vac_update_relstats(onerel,
new_rel_pages,
new_rel_tuples,
new_rel_allvisible,
vacrelstats->hasindex,
- new_frozen_xid);
+ new_frozen_xid,
+ new_min_multi,
+ false);
/* report results to the stats collector, too */
+ new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples;
+ if (new_live_tuples < 0)
+ new_live_tuples = 0; /* just in case */
+
pgstat_report_vacuum(RelationGetRelid(onerel),
onerel->rd_rel->relisshared,
- new_rel_tuples);
+ new_live_tuples,
+ vacrelstats->new_dead_tuples);
+ pgstat_progress_end_command();
/* and log the action if appropriate */
- if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+ if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0)
{
TimestampTz endtime = GetCurrentTimestamp();
- if (Log_autovacuum_min_duration == 0 ||
+ if (params->log_min_duration == 0 ||
TimestampDifferenceExceeds(starttime, endtime,
- Log_autovacuum_min_duration))
+ params->log_min_duration))
{
+ StringInfoData buf;
+
TimestampDifference(starttime, endtime, &secs, &usecs);
read_rate = 0;
write_rate = (double) BLCKSZ *VacuumPageDirty / (1024 * 1024) /
(secs + usecs / 1000000.0);
}
+
+ /*
+ * This is pretty messy, but we split it up so that we can skip
+ * emitting individual parts of the message when not applicable.
+ */
+ initStringInfo(&buf);
+ appendStringInfo(&buf, _("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"),
+ get_database_name(MyDatabaseId),
+ get_namespace_name(RelationGetNamespace(onerel)),
+ RelationGetRelationName(onerel),
+ vacrelstats->num_index_scans);
+ appendStringInfo(&buf, _("pages: %u removed, %u remain, %u skipped due to pins, %u skipped frozen\n"),
+ vacrelstats->pages_removed,
+ vacrelstats->rel_pages,
+ vacrelstats->pinskipped_pages,
+ vacrelstats->frozenskipped_pages);
+ appendStringInfo(&buf,
+ _("tuples: %.0f removed, %.0f remain, %.0f are dead but not yet removable\n"),
+ vacrelstats->tuples_deleted,
+ vacrelstats->new_rel_tuples,
+ vacrelstats->new_dead_tuples);
+ appendStringInfo(&buf,
+ _("buffer usage: %d hits, %d misses, %d dirtied\n"),
+ VacuumPageHit,
+ VacuumPageMiss,
+ VacuumPageDirty);
+ appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"),
+ read_rate, write_rate);
+ appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0));
+
ereport(LOG,
- (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
- "pages: %d removed, %d remain\n"
- "tuples: %.0f removed, %.0f remain\n"
- "buffer usage: %d hits, %d misses, %d dirtied\n"
- "avg read rate: %.3f MiB/s, avg write rate: %.3f MiB/s\n"
- "system usage: %s",
- get_database_name(MyDatabaseId),
- get_namespace_name(RelationGetNamespace(onerel)),
- RelationGetRelationName(onerel),
- vacrelstats->num_index_scans,
- vacrelstats->pages_removed,
- vacrelstats->rel_pages,
- vacrelstats->tuples_deleted,
- vacrelstats->new_rel_tuples,
- VacuumPageHit,
- VacuumPageMiss,
- VacuumPageDirty,
- read_rate, write_rate,
- pg_rusage_show(&ru0))));
+ (errmsg_internal("%s", buf.data)));
+ pfree(buf.data);
}
}
}
*/
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
- Relation *Irel, int nindexes, bool scan_all)
+ Relation *Irel, int nindexes, bool aggressive)
{
BlockNumber nblocks,
blkno;
int i;
PGRUsage ru0;
Buffer vmbuffer = InvalidBuffer;
- BlockNumber next_not_all_visible_block;
- bool skipping_all_visible_blocks;
+ BlockNumber next_unskippable_block;
+ bool skipping_blocks;
+ xl_heap_freeze_tuple *frozen;
+ StringInfoData buf;
+ const int initprog_index[] = {
+ PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_TOTAL_HEAP_BLKS,
+ PROGRESS_VACUUM_MAX_DEAD_TUPLES
+ };
+ int64 initprog_val[3];
pg_rusage_init(&ru0);
vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
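+
+ /* frozen[] stages freeze plans for at most one heap page at a time */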
+ frozen = palloc(sizeof(xl_heap_freeze_tuple) * MaxHeapTuplesPerPage);
+
+ /* Report that we're scanning the heap, advertising total # of blocks */
+ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP;
+ initprog_val[1] = nblocks;
+ initprog_val[2] = vacrelstats->max_dead_tuples;
+ pgstat_progress_update_multi_param(3, initprog_index, initprog_val);
/*
- * We want to skip pages that don't require vacuuming according to the
- * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
- * consecutive pages. Since we're reading sequentially, the OS should be
- * doing readahead for us, so there's no gain in skipping a page now and
- * then; that's likely to disable readahead and so be counterproductive.
- * Also, skipping even a single page means that we can't update
- * relfrozenxid, so we only want to do it if we can skip a goodly number
- * of pages.
+ * Except when aggressive is set, we want to skip pages that are
+ * all-visible according to the visibility map, but only when we can skip
+ * at least SKIP_PAGES_THRESHOLD consecutive pages. Since we're reading
+ * sequentially, the OS should be doing readahead for us, so there's no
+ * gain in skipping a page now and then; that's likely to disable
+ * readahead and so be counterproductive. Also, skipping even a single
+ * page means that we can't update relfrozenxid, so we only want to do it
+ * if we can skip a goodly number of pages.
*
- * Before entering the main loop, establish the invariant that
- * next_not_all_visible_block is the next block number >= blkno that's not
- * all-visible according to the visibility map, or nblocks if there's no
- * such block. Also, we set up the skipping_all_visible_blocks flag,
- * which is needed because we need hysteresis in the decision: once we've
- * started skipping blocks, we may as well skip everything up to the next
- * not-all-visible block.
+ * When aggressive is set, we can't skip pages just because they are
+ * all-visible, but we can still skip pages that are all-frozen, since
+ * such pages do not need freezing and do not affect the value that we can
+ * safely set for relfrozenxid or relminmxid.
*
- * Note: if scan_all is true, we won't actually skip any pages; but we
- * maintain next_not_all_visible_block anyway, so as to set up the
- * all_visible_according_to_vm flag correctly for each page.
+ * Before entering the main loop, establish the invariant that
+ * next_unskippable_block is the next block number >= blkno that we can't
+ * skip based on the visibility map, either all-visible for a
+ * regular scan or all-frozen for an aggressive scan. We set it to
+ * nblocks if there's no such block. We also set up the skipping_blocks
+ * flag correctly at this stage.
*
- * Note: The value returned by visibilitymap_test could be slightly
+ * Note: The value returned by visibilitymap_get_status could be slightly
* out-of-date, since we make this test before reading the corresponding
* heap page or locking the buffer. This is OK. If we mistakenly think
- * that the page is all-visible when in fact the flag's just been cleared,
- * we might fail to vacuum the page. But it's OK to skip pages when
- * scan_all is not set, so no great harm done; the next vacuum will find
- * them. If we make the reverse mistake and vacuum a page unnecessarily,
- * it'll just be a no-op.
+ * that the page is all-visible or all-frozen when in fact the flag's just
+ * been cleared, we might fail to vacuum the page. It's easy to see that
+ * skipping a page when aggressive is not set is not a very big deal; we
+ * might leave some dead tuples lying around, but the next vacuum will
+ * find them. But even when aggressive *is* set, it's still OK if we miss
+ * a page whose all-frozen marking has just been cleared. Any new XIDs
+ * just added to that page are necessarily newer than the GlobalXmin we
+ * computed, so they'll have no effect on the value to which we can safely
+ * set relfrozenxid. A similar argument applies for MXIDs and relminmxid.
+ *
+ * We will scan the table's last page, at least to the extent of
+ * determining whether it has tuples or not, even if it should be skipped
+ * according to the above rules; except when we've already determined that
+ * it's not worth trying to truncate the table. This avoids having
+ * lazy_truncate_heap() take access-exclusive lock on the table to attempt
+ * a truncation that just fails immediately because there are tuples in
+ * the last page. This is worth avoiding mainly because such a lock must
+ * be replayed on any hot standby, where it can be disruptive.
*/
- for (next_not_all_visible_block = 0;
- next_not_all_visible_block < nblocks;
- next_not_all_visible_block++)
+ for (next_unskippable_block = 0;
+ next_unskippable_block < nblocks;
+ next_unskippable_block++)
{
- if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
- break;
+ uint8 vmstatus;
+
+ vmstatus = visibilitymap_get_status(onerel, next_unskippable_block,
+ &vmbuffer);
+ if (aggressive)
+ {
+ if ((vmstatus & VISIBILITYMAP_ALL_FROZEN) == 0)
+ break;
+ }
+ else
+ {
+ if ((vmstatus & VISIBILITYMAP_ALL_VISIBLE) == 0)
+ break;
+ }
vacuum_delay_point();
}
- if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
- skipping_all_visible_blocks = true;
+
+ if (next_unskippable_block >= SKIP_PAGES_THRESHOLD)
+ skipping_blocks = true;
else
- skipping_all_visible_blocks = false;
+ skipping_blocks = false;
for (blkno = 0; blkno < nblocks; blkno++)
{
bool tupgone,
hastup;
int prev_dead_count;
- OffsetNumber frozen[MaxOffsetNumber];
int nfrozen;
Size freespace;
- bool all_visible_according_to_vm;
+ bool all_visible_according_to_vm = false;
bool all_visible;
+ bool all_frozen = true; /* provided all_visible is also true */
bool has_dead_tuples;
TransactionId visibility_cutoff_xid = InvalidTransactionId;
- if (blkno == next_not_all_visible_block)
+ /* see note above about forcing scanning of last page */
+#define FORCE_CHECK_PAGE() \
+ (blkno == nblocks - 1 && should_attempt_truncation(vacrelstats))
+
+ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+
+ if (blkno == next_unskippable_block)
{
- /* Time to advance next_not_all_visible_block */
- for (next_not_all_visible_block++;
- next_not_all_visible_block < nblocks;
- next_not_all_visible_block++)
+ /* Time to advance next_unskippable_block */
+ for (next_unskippable_block++;
+ next_unskippable_block < nblocks;
+ next_unskippable_block++)
{
- if (!visibilitymap_test(onerel, next_not_all_visible_block,
- &vmbuffer))
- break;
+ uint8 vmskipflags;
+
+ vmskipflags = visibilitymap_get_status(onerel,
+ next_unskippable_block,
+ &vmbuffer);
+ if (aggressive)
+ {
+ if ((vmskipflags & VISIBILITYMAP_ALL_FROZEN) == 0)
+ break;
+ }
+ else
+ {
+ if ((vmskipflags & VISIBILITYMAP_ALL_VISIBLE) == 0)
+ break;
+ }
vacuum_delay_point();
}
- * skipping_all_visible_blocks to do the right thing at the
+ * skipping_blocks to do the right thing at the
* following blocks.
*/
- if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
- skipping_all_visible_blocks = true;
+ if (next_unskippable_block - blkno > SKIP_PAGES_THRESHOLD)
+ skipping_blocks = true;
else
- skipping_all_visible_blocks = false;
- all_visible_according_to_vm = false;
+ skipping_blocks = false;
+
+ /*
+ * Normally, the fact that we can't skip this block must mean that
+ * it's not all-visible. But in an aggressive vacuum we know only
+ * that it's not all-frozen, so it might still be all-visible.
+ */
+ if (aggressive && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
+ all_visible_according_to_vm = true;
}
else
{
- /* Current block is all-visible */
- if (skipping_all_visible_blocks && !scan_all)
+ /*
+ * The current block is potentially skippable; if we've seen a
+ * long enough run of skippable blocks to justify skipping it, and
+ * we're not forced to check it, then go ahead and skip.
+ * Otherwise, the page must be at least all-visible if not
+ * all-frozen, so we can set all_visible_according_to_vm = true.
+ */
+ if (skipping_blocks && !FORCE_CHECK_PAGE())
+ {
+ /*
+ * Tricky, tricky. If this is in aggressive vacuum, the page
+ * must have been all-frozen at the time we checked whether it
+ * was skippable, but it might not be any more. We must be
+ * careful to count it as a skipped all-frozen page in that
+ * case, or else we'll think we can't update relfrozenxid and
+ * relminmxid. If it's not an aggressive vacuum, we don't
+ * know whether it was all-frozen, so we have to recheck; but
+ * in this case an approximate answer is OK.
+ */
+ if (aggressive || VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
+ vacrelstats->frozenskipped_pages++;
continue;
+ }
all_visible_according_to_vm = true;
}
if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
vacrelstats->num_dead_tuples > 0)
{
+ const int hvp_index[] = {
+ PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_NUM_INDEX_VACUUMS
+ };
+ int64 hvp_val[2];
+
/*
* Before beginning index vacuuming, we release any pin we may
* hold on the visibility map page. This isn't necessary for
/* Log cleanup info before we touch indexes */
vacuum_log_cleanup_info(onerel, vacrelstats);
+ /* Report that we are now vacuuming indexes */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
+
+ /*
+ * Report that we are now vacuuming the heap. We also increase
+ * the number of index scans here; note that by using
+ * pgstat_progress_update_multi_param we can update both
+ * parameters atomically.
+ */
+ hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
+ hvp_val[1] = vacrelstats->num_index_scans + 1;
+ pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
+
/* Remove tuples from heap */
lazy_vacuum_heap(onerel, vacrelstats);
*/
vacrelstats->num_dead_tuples = 0;
vacrelstats->num_index_scans++;
+
+ /* Report that we are once again scanning the heap */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_SCAN_HEAP);
}
/*
* Pin the visibility map page in case we need to mark the page
* all-visible. In most cases this will be very cheap, because we'll
* already have the correct page pinned anyway. However, it's
- * possible that (a) next_not_all_visible_block is covered by a
- * different VM page than the current block or (b) we released our pin
- * and did a cycle of index vacuuming.
+ * possible that (a) next_unskippable_block is covered by a different
+ * VM page than the current block or (b) we released our pin and did a
+ * cycle of index vacuuming.
*/
visibilitymap_pin(onerel, blkno, &vmbuffer);
if (!ConditionalLockBufferForCleanup(buf))
{
/*
- * If we're not scanning the whole relation to guard against XID
- * wraparound, it's OK to skip vacuuming a page. The next vacuum
- * will clean it up.
+ * If we're not performing an aggressive scan to guard against XID
+ * wraparound, and we don't want to forcibly check the page, then
+ * it's OK to skip vacuuming pages we get a lock conflict on. They
+ * will be dealt with in some future vacuum.
*/
- if (!scan_all)
+ if (!aggressive && !FORCE_CHECK_PAGE())
{
ReleaseBuffer(buf);
+ vacrelstats->pinskipped_pages++;
continue;
}
/*
- * If this is a wraparound checking vacuum, then we read the page
- * with share lock to see if any xids need to be frozen. If the
- * page doesn't need attention we just skip and continue. If it
- * does, we wait for cleanup lock.
+ * Read the page with share lock to see if any xids on it need to
+ * be frozen. If not we just skip the page, after updating our
+ * scan statistics. If there are some, we wait for cleanup lock.
*
* We could defer the lock request further by remembering the page
* and coming back to it later, or we could even register
* ourselves for multiple buffers and then service whichever one
* is received first. For now, this seems good enough.
+ *
+ * If we get here with aggressive false, then we're just forcibly
+ * checking the page, and so we don't want to insist on getting
+ * the lock; we only need to know if the page contains tuples, so
+ * that we can update nonempty_pages correctly. It's convenient
+ * to use lazy_check_needs_freeze() for both situations, though.
*/
LockBuffer(buf, BUFFER_LOCK_SHARE);
- if (!lazy_check_needs_freeze(buf))
+ if (!lazy_check_needs_freeze(buf, &hastup))
{
UnlockReleaseBuffer(buf);
+ vacrelstats->scanned_pages++;
+ vacrelstats->pinskipped_pages++;
+ if (hastup)
+ vacrelstats->nonempty_pages = blkno + 1;
+ continue;
+ }
+ if (!aggressive)
+ {
+ /*
+ * Here, we must not advance scanned_pages; that would amount
+ * to claiming that the page contains no freezable tuples.
+ */
+ UnlockReleaseBuffer(buf);
+ vacrelstats->pinskipped_pages++;
+ if (hastup)
+ vacrelstats->nonempty_pages = blkno + 1;
continue;
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
empty_pages++;
freespace = PageGetHeapFreeSpace(page);
- /* empty pages are always all-visible */
+ /* empty pages are always all-visible and all-frozen */
if (!PageIsAllVisible(page))
{
- PageSetAllVisible(page);
+ START_CRIT_SECTION();
+
+ /* mark buffer dirty before writing a WAL record */
MarkBufferDirty(buf);
- visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
- InvalidTransactionId);
+
+ /*
+ * It's possible that another backend has extended the heap,
+ * initialized the page, and then failed to WAL-log the page
+ * due to an ERROR. Since heap extension is not WAL-logged,
+ * recovery might try to replay our record setting the page
+ * all-visible and find that the page isn't initialized, which
+ * will cause a PANIC. To prevent that, check whether the
+ * page has been previously WAL-logged, and if not, do that
+ * now.
+ */
+ if (RelationNeedsWAL(onerel) &&
+ PageGetLSN(page) == InvalidXLogRecPtr)
+ log_newpage_buffer(buf, true);
+
+ PageSetAllVisible(page);
+ visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
+ vmbuffer, InvalidTransactionId,
+ VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
+ END_CRIT_SECTION();
}
UnlockReleaseBuffer(buf);
hastup = false;
prev_dead_count = vacrelstats->num_dead_tuples;
maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Note: If you change anything in the loop below, also look at
+ * heap_page_is_all_visible to see if that needs to be changed.
+ */
for (offnum = FirstOffsetNumber;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum))
tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
tuple.t_len = ItemIdGetLength(itemid);
+ tuple.t_tableOid = RelationGetRelid(onerel);
tupgone = false;
- switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
+ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
{
case HEAPTUPLE_DEAD:
* NB: Like with per-tuple hint bits, we can't set the
* PD_ALL_VISIBLE flag if the inserter committed
* asynchronously. See SetHintBits for more info. Check
- * that the HEAP_XMIN_COMMITTED hint bit is set because of
+ * that the tuple is hinted xmin-committed because of
* that.
*/
if (all_visible)
{
TransactionId xmin;
- if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
+ if (!HeapTupleHeaderXminCommitted(tuple.t_data))
{
all_visible = false;
break;
* Each non-removable tuple must be checked to see if it needs
* freezing. Note we already have exclusive buffer lock.
*/
- if (heap_freeze_tuple(tuple.t_data, FreezeLimit))
- frozen[nfrozen++] = offnum;
+ if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit,
+ MultiXactCutoff, &frozen[nfrozen]))
+ frozen[nfrozen++].offset = offnum;
+ else if (heap_tuple_needs_eventual_freeze(tuple.t_data))
+ all_frozen = false;
}
} /* scan along page */
*/
if (nfrozen > 0)
{
+ START_CRIT_SECTION();
+
MarkBufferDirty(buf);
+
+ /* execute collected freezes */
+ for (i = 0; i < nfrozen; i++)
+ {
+ ItemId itemid;
+ HeapTupleHeader htup;
+
+ itemid = PageGetItemId(page, frozen[i].offset);
+ htup = (HeapTupleHeader) PageGetItem(page, itemid);
+
+ heap_execute_freeze_tuple(htup, &frozen[i]);
+ }
+
+ /* Now WAL-log freezing if necessary */
if (RelationNeedsWAL(onerel))
{
XLogRecPtr recptr;
recptr = log_heap_freeze(onerel, buf, FreezeLimit,
frozen, nfrozen);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
+
+ END_CRIT_SECTION();
}
/*
vacrelstats->num_dead_tuples > 0)
{
/* Remove tuples from heap */
- lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
+ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
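+ /* we just removed this page's dead tuples, so reset the flag */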
+ has_dead_tuples = false;
/*
* Forget the now-vacuumed tuples, and press on, but be careful
freespace = PageGetHeapFreeSpace(page);
/* mark page all-visible, if appropriate */
- if (all_visible)
+ if (all_visible && !all_visible_according_to_vm)
{
- if (!PageIsAllVisible(page))
- {
- PageSetAllVisible(page);
- MarkBufferDirty(buf);
- visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
- visibility_cutoff_xid);
- }
- else if (!all_visible_according_to_vm)
- {
- /*
- * It should never be the case that the visibility map page is
- * set while the page-level bit is clear, but the reverse is
- * allowed. Set the visibility map bit as well so that we get
- * back in sync.
- */
- visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
- visibility_cutoff_xid);
- }
+ uint8 flags = VISIBILITYMAP_ALL_VISIBLE;
+
+ if (all_frozen)
+ flags |= VISIBILITYMAP_ALL_FROZEN;
+
+ /*
+ * It should never be the case that the visibility map page is set
+ * while the page-level bit is clear, but the reverse is allowed
+ * (if checksums are not enabled). Regardless, set both bits
+ * so that we get back in sync.
+ *
+ * NB: If the heap page is all-visible but the VM bit is not set,
+ * we don't need to dirty the heap page. However, if checksums
+ * are enabled, we do need to make sure that the heap page is
+ * dirtied before passing it to visibilitymap_set(), because it
+ * may be logged. Given that this situation should only happen in
+ * rare cases after a crash, it is not worth optimizing.
+ */
+ PageSetAllVisible(page);
+ MarkBufferDirty(buf);
+ visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
+ vmbuffer, visibility_cutoff_xid, flags);
}
/*
* that something bad has happened.
*/
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
- && visibilitymap_test(onerel, blkno, &vmbuffer))
+ && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer))
{
elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
relname, blkno);
visibilitymap_clear(onerel, blkno, vmbuffer);
}
+ /*
+ * If the page is all-visible according to the VM and we also found it to
+ * be all-frozen, but the all-frozen bit is not yet set there, set it now.
+ * Note that all_frozen is only valid if all_visible is true, so we must
+ * check both.
+ */
+ else if (all_visible_according_to_vm && all_visible && all_frozen &&
+ !VM_ALL_FROZEN(onerel, blkno, &vmbuffer))
+ {
+ /*
+ * We can pass InvalidTransactionId as the cutoff XID here,
+ * because setting the all-frozen bit doesn't cause recovery
+ * conflicts.
+ */
+ visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
+ vmbuffer, InvalidTransactionId,
+ VISIBILITYMAP_ALL_FROZEN);
+ }
+
UnlockReleaseBuffer(buf);
/* Remember the location of the last page with nonremovable tuples */
/*
* If we remembered any tuples for deletion, then the page will be
* visited again by lazy_vacuum_heap, which will compute and record
- * its post-compaction free space. If not, then we're done with this
- * page, so remember its free space as-is. (This path will always be
+ * its post-compaction free space. If not, then we're done with this
+ * page, so remember its free space as-is. (This path will always be
* taken if there are no indexes.)
*/
if (vacrelstats->num_dead_tuples == prev_dead_count)
RecordPageWithFreeSpace(onerel, blkno, freespace);
}
+ /* report that everything is scanned and vacuumed */
+ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno);
+
+ pfree(frozen);
+
/* save stats for use later */
vacrelstats->scanned_tuples = num_tuples;
vacrelstats->tuples_deleted = tups_vacuumed;
+ vacrelstats->new_dead_tuples = nkeep;
/* now we can compute the new value for pg_class.reltuples */
vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
vacrelstats->scanned_pages,
num_tuples);
+ /*
+ * Release any remaining pin on visibility map page.
+ */
+ if (BufferIsValid(vmbuffer))
+ {
+ ReleaseBuffer(vmbuffer);
+ vmbuffer = InvalidBuffer;
+ }
+
/* If any tuples need to be deleted, perform final vacuum cycle */
/* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
{
+ const int hvp_index[] = {
+ PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_NUM_INDEX_VACUUMS
+ };
+ int64 hvp_val[2];
+
/* Log cleanup info before we touch indexes */
vacuum_log_cleanup_info(onerel, vacrelstats);
+ /* Report that we are now vacuuming indexes */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_VACUUM_INDEX);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
&indstats[i],
vacrelstats);
+
+ /* Report that we are now vacuuming the heap */
+ hvp_val[0] = PROGRESS_VACUUM_PHASE_VACUUM_HEAP;
+ hvp_val[1] = vacrelstats->num_index_scans + 1;
+ pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
+
/* Remove tuples from heap */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
lazy_vacuum_heap(onerel, vacrelstats);
vacrelstats->num_index_scans++;
}
- /* Release the pin on the visibility map page */
- if (BufferIsValid(vmbuffer))
- {
- ReleaseBuffer(vmbuffer);
- vmbuffer = InvalidBuffer;
- }
+ /* report all blocks vacuumed; and that we're cleaning up */
+ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_INDEX_CLEANUP);
/* Do post-vacuum cleanup and statistics update for each index */
for (i = 0; i < nindexes; i++)
RelationGetRelationName(onerel),
tups_vacuumed, vacuumed_pages)));
+ /*
+ * This is pretty messy, but we split it up so that we can skip emitting
+ * individual parts of the message when not applicable.
+ */
+ initStringInfo(&buf);
+ appendStringInfo(&buf,
+ _("%.0f dead row versions cannot be removed yet.\n"),
+ nkeep);
+ appendStringInfo(&buf, _("There were %.0f unused item pointers.\n"),
+ nunused);
+ appendStringInfo(&buf, ngettext("Skipped %u page due to buffer pins.\n",
+ "Skipped %u pages due to buffer pins.\n",
+ vacrelstats->pinskipped_pages),
+ vacrelstats->pinskipped_pages);
+ appendStringInfo(&buf, ngettext("%u page is entirely empty.\n",
+ "%u pages are entirely empty.\n",
+ empty_pages),
+ empty_pages);
+ appendStringInfo(&buf, _("%s."),
+ pg_rusage_show(&ru0));
+
ereport(elevel,
(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
RelationGetRelationName(onerel),
tups_vacuumed, num_tuples,
vacrelstats->scanned_pages, nblocks),
- errdetail("%.0f dead row versions cannot be removed yet.\n"
- "There were %.0f unused item pointers.\n"
- "%u pages are entirely empty.\n"
- "%s.",
- nkeep,
- nunused,
- empty_pages,
- pg_rusage_show(&ru0))));
+ errdetail_internal("%s", buf.data)));
+ pfree(buf.data);
}
int tupindex;
int npages;
PGRUsage ru0;
+ Buffer vmbuffer = InvalidBuffer;
pg_rusage_init(&ru0);
npages = 0;
++tupindex;
continue;
}
- tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
+ tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
+ &vmbuffer);
/* Now that we've compacted the page, record its available space */
page = BufferGetPage(buf);
npages++;
}
+ if (BufferIsValid(vmbuffer))
+ {
+ ReleaseBuffer(vmbuffer);
+ vmbuffer = InvalidBuffer;
+ }
+
ereport(elevel,
(errmsg("\"%s\": removed %d row versions in %d pages",
RelationGetRelationName(onerel),
*/
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
- int tupindex, LVRelStats *vacrelstats)
+ int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
{
Page page = BufferGetPage(buffer);
OffsetNumber unused[MaxOffsetNumber];
int uncnt = 0;
+ TransactionId visibility_cutoff_xid;
+ bool all_frozen;
+
+ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno);
START_CRIT_SECTION();
PageRepairFragmentation(page);
+ /*
+ * Mark buffer dirty before we write WAL.
+ */
MarkBufferDirty(buffer);
/* XLOG stuff */
unused, uncnt,
vacrelstats->latestRemovedXid);
PageSetLSN(page, recptr);
- PageSetTLI(page, ThisTimeLineID);
}
+ /*
+ * End critical section, so we safely can do visibility tests (which
+ * possibly need to perform IO and allocate memory!). If we crash now the
+ * page (including the corresponding vm bit) might not be marked all
+ * visible, but that's fine. A later vacuum will fix that.
+ */
END_CRIT_SECTION();
+ /*
+ * Now that we have removed the dead tuples from the page, once again
+ * check if the page has become all-visible. The page is already marked
+ * dirty, exclusively locked, and, if needed, a full page image has been
+ * emitted in the log_heap_clean() above.
+ */
+ if (heap_page_is_all_visible(onerel, buffer, &visibility_cutoff_xid,
+ &all_frozen))
+ PageSetAllVisible(page);
+
+ /*
+ * All the changes to the heap page have been done. If the all-visible
+ * flag is now set, also set the VM all-visible bit (and, if possible, the
+ * all-frozen bit) unless this has already been done previously.
+ */
+ if (PageIsAllVisible(page))
+ {
+ uint8 vm_status = visibilitymap_get_status(onerel, blkno, vmbuffer);
+ uint8 flags = 0;
+
+ /* Work out which VM bits still need to be set */
+ if ((vm_status & VISIBILITYMAP_ALL_VISIBLE) == 0)
+ flags |= VISIBILITYMAP_ALL_VISIBLE;
+ if ((vm_status & VISIBILITYMAP_ALL_FROZEN) == 0 && all_frozen)
+ flags |= VISIBILITYMAP_ALL_FROZEN;
+
+ Assert(BufferIsValid(*vmbuffer));
+ if (flags != 0)
+ visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr,
+ *vmbuffer, visibility_cutoff_xid, flags);
+ }
+
return tupindex;
}
* need to be cleaned to avoid wraparound
*
* Returns true if the page needs to be vacuumed using cleanup lock.
+ * Also returns a flag indicating whether the page contains any tuples at all.
*/
static bool
-lazy_check_needs_freeze(Buffer buf)
+lazy_check_needs_freeze(Buffer buf, bool *hastup)
{
- Page page;
+ Page page = BufferGetPage(buf);
OffsetNumber offnum,
maxoff;
HeapTupleHeader tupleheader;
- page = BufferGetPage(buf);
+ *hastup = false;
- if (PageIsNew(page) || PageIsEmpty(page))
- {
- /* PageIsNew probably shouldn't happen... */
+ /* If we hit an uninitialized page, we want to force vacuuming it. */
+ if (PageIsNew(page))
+ return true;
+
+ /* Quick out for ordinary empty page. */
+ if (PageIsEmpty(page))
return false;
- }
maxoff = PageGetMaxOffsetNumber(page);
for (offnum = FirstOffsetNumber;
itemid = PageGetItemId(page, offnum);
+ /* this should match hastup test in count_nondeletable_pages() */
+ if (ItemIdIsUsed(itemid))
+ *hastup = true;
+
+ /* dead and redirect items never need freezing */
if (!ItemIdIsNormal(itemid))
continue;
tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
- if (heap_tuple_needs_freeze(tupleheader, FreezeLimit, buf))
+ if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
+ MultiXactCutoff, buf))
return true;
} /* scan along page */
stats->num_index_tuples,
0,
false,
- InvalidTransactionId);
+ InvalidTransactionId,
+ InvalidMultiXactId,
+ false);
ereport(elevel,
(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
pfree(stats);
}
+/*
+ * should_attempt_truncation - should we attempt to truncate the heap?
+ *
+ * Don't even think about it unless we have a shot at releasing a goodly
+ * number of pages. Otherwise, the time taken isn't worth it.
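+ *
+ * Also don't attempt it if we are doing early pruning/vacuuming
+ * (old_snapshot_threshold >= 0), because a scan that cannot find a truncated
+ * heap page cannot determine that the snapshot is too old to read that page.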
+ *
+ * This is split out so that we can test whether truncation is going to be
+ * called for before we actually do it. If you change the logic here, be
+ * careful to depend only on fields that lazy_scan_heap updates on-the-fly.
+ */
+static bool
+should_attempt_truncation(LVRelStats *vacrelstats)
+{
+ BlockNumber possibly_freeable;
+
+ possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
+ if (possibly_freeable > 0 &&
+ (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
+ possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION) &&
+ old_snapshot_threshold < 0)
+ return true;
+ else
+ return false;
+}
+
/*
* lazy_truncate_heap - try to truncate off any empty pages at the end
*/
BlockNumber old_rel_pages = vacrelstats->rel_pages;
BlockNumber new_rel_pages;
PGRUsage ru0;
+ int lock_retry;
pg_rusage_init(&ru0);
- /*
- * We need full exclusive lock on the relation in order to do truncation.
- * If we can't get it, give up rather than waiting --- we don't want to
- * block other backends, and we don't want to deadlock (which is quite
- * possible considering we already hold a lower-grade lock).
- */
- if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
- return;
+ /* Report that we are now truncating */
+ pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
+ PROGRESS_VACUUM_PHASE_TRUNCATE);
/*
- * Now that we have exclusive lock, look to see if the rel has grown
- * whilst we were vacuuming with non-exclusive lock. If so, give up; the
- * newly added pages presumably contain non-deletable tuples.
+ * Loop until no more truncating can be done.
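+ * If another backend's lock request cuts an iteration short, we release
+ * the exclusive lock and try again, so long as potentially-truncatable
+ * pages remain at the end of the relation.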
*/
- new_rel_pages = RelationGetNumberOfBlocks(onerel);
- if (new_rel_pages != old_rel_pages)
+ do
{
/*
- * Note: we intentionally don't update vacrelstats->rel_pages with the
- * new rel size here. If we did, it would amount to assuming that the
- * new pages are empty, which is unlikely. Leaving the numbers alone
- * amounts to assuming that the new pages have the same tuple density
- * as existing ones, which is less unlikely.
+ * We need full exclusive lock on the relation in order to do
+ * truncation. If we can't get it, give up rather than waiting --- we
+ * don't want to block other backends, and we don't want to deadlock
+ * (which is quite possible considering we already hold a lower-grade
+ * lock).
*/
- UnlockRelation(onerel, AccessExclusiveLock);
- return;
- }
+ vacrelstats->lock_waiter_detected = false;
+ lock_retry = 0;
+ while (true)
+ {
+ if (ConditionalLockRelation(onerel, AccessExclusiveLock))
+ break;
- /*
- * Scan backwards from the end to verify that the end pages actually
- * contain no tuples. This is *necessary*, not optional, because other
- * backends could have added tuples to these pages whilst we were
- * vacuuming.
- */
- new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
+ /*
+ * Check for interrupts while trying to (re-)acquire the exclusive
+ * lock.
+ */
+ CHECK_FOR_INTERRUPTS();
- if (new_rel_pages >= old_rel_pages)
- {
- /* can't do anything after all */
- UnlockRelation(onerel, AccessExclusiveLock);
- return;
- }
+ if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
+ VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
+ {
+ /*
+ * We failed to establish the lock in the specified number of
+ * retries. This means we give up truncating.
+ */
+ vacrelstats->lock_waiter_detected = true;
+ ereport(elevel,
+ (errmsg("\"%s\": stopping truncate due to conflicting lock request",
+ RelationGetRelationName(onerel))));
+ return;
+ }
- /*
- * Okay to truncate.
- */
- RelationTruncate(onerel, new_rel_pages);
+ pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L); /* interval is in ms */
+ }
- /*
- * We can release the exclusive lock as soon as we have truncated. Other
- * backends can't safely access the relation until they have processed the
- * smgr invalidation that smgrtruncate sent out ... but that should happen
- * as part of standard invalidation processing once they acquire lock on
- * the relation.
- */
- UnlockRelation(onerel, AccessExclusiveLock);
+ /*
+ * Now that we have exclusive lock, look to see if the rel has grown
+ * whilst we were vacuuming with non-exclusive lock. If so, give up;
+ * the newly added pages presumably contain non-deletable tuples.
+ */
+ new_rel_pages = RelationGetNumberOfBlocks(onerel);
+ if (new_rel_pages != old_rel_pages)
+ {
+ /*
+ * Note: we intentionally don't update vacrelstats->rel_pages with
+ * the new rel size here. If we did, it would amount to assuming
+ * that the new pages are empty, which is unlikely. Leaving the
+ * numbers alone amounts to assuming that the new pages have the
+ * same tuple density as existing ones, which is less unlikely.
+ */
+ UnlockRelation(onerel, AccessExclusiveLock);
+ return;
+ }
- /*
- * Update statistics. Here, it *is* correct to adjust rel_pages without
- * also touching reltuples, since the tuple count wasn't changed by the
- * truncation.
- */
- vacrelstats->rel_pages = new_rel_pages;
- vacrelstats->pages_removed = old_rel_pages - new_rel_pages;
+ /*
+ * Scan backwards from the end to verify that the end pages actually
+ * contain no tuples. This is *necessary*, not optional, because
+ * other backends could have added tuples to these pages whilst we
+ * were vacuuming.
+ */
+ new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
- ereport(elevel,
- (errmsg("\"%s\": truncated %u to %u pages",
- RelationGetRelationName(onerel),
- old_rel_pages, new_rel_pages),
- errdetail("%s.",
- pg_rusage_show(&ru0))));
+ if (new_rel_pages >= old_rel_pages)
+ {
+ /* can't do anything after all */
+ UnlockRelation(onerel, AccessExclusiveLock);
+ return;
+ }
+
+ /*
+ * Okay to truncate.
+ */
+ RelationTruncate(onerel, new_rel_pages);
+
+ /*
+ * We can release the exclusive lock as soon as we have truncated.
+ * Other backends can't safely access the relation until they have
+ * processed the smgr invalidation that smgrtruncate sent out ... but
+ * that should happen as part of standard invalidation processing once
+ * they acquire lock on the relation.
+ */
+ UnlockRelation(onerel, AccessExclusiveLock);
+
+ /*
+ * Update statistics. Here, it *is* correct to adjust rel_pages
+ * without also touching reltuples, since the tuple count wasn't
+ * changed by the truncation.
+ */
+ vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
+ vacrelstats->rel_pages = new_rel_pages;
+
+ ereport(elevel,
+ (errmsg("\"%s\": truncated %u to %u pages",
+ RelationGetRelationName(onerel),
+ old_rel_pages, new_rel_pages),
+ errdetail("%s.",
+ pg_rusage_show(&ru0))));
+ old_rel_pages = new_rel_pages;
+ } while (new_rel_pages > vacrelstats->nonempty_pages &&
+ vacrelstats->lock_waiter_detected);
}
/*
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
BlockNumber blkno;
+ instr_time starttime;
+
+ /* Initialize the starttime if we check for conflicting lock requests */
+ INSTR_TIME_SET_CURRENT(starttime);
/* Strange coding of loop control is needed because blkno is unsigned */
blkno = vacrelstats->rel_pages;
maxoff;
bool hastup;
+ /*
+ * Check if another process requests a lock on our relation. We are
+ * holding an AccessExclusiveLock here, so they will be waiting. We
+ * only do this once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we
+ * only check if that interval has elapsed once every 32 blocks to
+ * keep the number of system calls and actual shared lock table
+ * lookups to a minimum.
+ */
+ if ((blkno % 32) == 0)
+ {
+ instr_time currenttime;
+ instr_time elapsed;
+
+ INSTR_TIME_SET_CURRENT(currenttime);
+ elapsed = currenttime;
+ INSTR_TIME_SUBTRACT(elapsed, starttime);
+ if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
+ >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+ {
+ if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
+ {
+ ereport(elevel,
+ (errmsg("\"%s\": suspending truncate due to conflicting lock request",
+ RelationGetRelationName(onerel))));
+
+ vacrelstats->lock_waiter_detected = true;
+ return blkno;
+ }
+ starttime = currenttime;
+ }
+ }
+
/*
* We don't insert a vacuum delay point here, because we have an
* exclusive lock on the table which we want to hold for as short a
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
long maxtuples;
+ int vac_work_mem = IsAutoVacuumWorkerProcess() &&
+ autovacuum_work_mem != -1 ?
+ autovacuum_work_mem : maintenance_work_mem;
if (vacrelstats->hasindex)
{
- maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
+ maxtuples = (vac_work_mem * 1024L) / sizeof(ItemPointerData);
maxtuples = Min(maxtuples, INT_MAX);
maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
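+
+ /*
+ * Illustrative arithmetic only: with a 64MB budget and 6-byte
+ * ItemPointerData entries, the array tops out around 11 million TIDs
+ * before an index-vacuum pass is forced.
+ */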
{
vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
vacrelstats->num_dead_tuples++;
+ pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES,
+ vacrelstats->num_dead_tuples);
}
}
return 0;
}
+
+/*
+ * Check if every tuple in the given page is visible to all current and future
+ * transactions. Also return the visibility_cutoff_xid which is the highest
+ * xmin amongst the visible tuples. Set *all_frozen to true if every tuple
+ * on this page is frozen.
+ */
+static bool
+heap_page_is_all_visible(Relation rel, Buffer buf,
+ TransactionId *visibility_cutoff_xid,
+ bool *all_frozen)
+{
+ Page page = BufferGetPage(buf);
+ BlockNumber blockno = BufferGetBlockNumber(buf);
+ OffsetNumber offnum,
+ maxoff;
+ bool all_visible = true;
+
+ *visibility_cutoff_xid = InvalidTransactionId;
+ *all_frozen = true;
+
+ /*
+ * This is a stripped down version of the line pointer scan in
+ * lazy_scan_heap(). So if you change anything here, also check that code.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff && all_visible;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid;
+ HeapTupleData tuple;
+
+ itemid = PageGetItemId(page, offnum);
+
+ /* Unused or redirect line pointers are of no interest */
+ if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
+ continue;
+
+ ItemPointerSet(&(tuple.t_self), blockno, offnum);
+
+ /*
+ * Dead line pointers can have index pointers pointing to them. So
+ * they can't be treated as visible.
+ */
+ if (ItemIdIsDead(itemid))
+ {
+ all_visible = false;
+ break;
+ }
+
+ Assert(ItemIdIsNormal(itemid));
+
+ tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
+ tuple.t_len = ItemIdGetLength(itemid);
+ tuple.t_tableOid = RelationGetRelid(rel);
+
+ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
+ {
+ case HEAPTUPLE_LIVE:
+ {
+ TransactionId xmin;
+
+ /* Check comments in lazy_scan_heap. */
+ if (!HeapTupleHeaderXminCommitted(tuple.t_data))
+ {
+ all_visible = false;
+ break;
+ }
+
+ /*
+ * The inserter definitely committed. But is it old enough
+ * that everyone sees it as committed?
+ */
+ xmin = HeapTupleHeaderGetXmin(tuple.t_data);
+ if (!TransactionIdPrecedes(xmin, OldestXmin))
+ {
+ all_visible = false;
+ break;
+ }
+
+ /* Track newest xmin on page. */
+ if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
+ *visibility_cutoff_xid = xmin;
+
+ /* Check whether this tuple is already frozen or not */
+ if (all_visible && *all_frozen &&
+ heap_tuple_needs_eventual_freeze(tuple.t_data))
+ *all_frozen = false;
+ }
+ break;
+
+ case HEAPTUPLE_DEAD:
+ case HEAPTUPLE_RECENTLY_DEAD:
+ case HEAPTUPLE_INSERT_IN_PROGRESS:
+ case HEAPTUPLE_DELETE_IN_PROGRESS:
+ all_visible = false;
+ break;
+
+ default:
+ elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
+ break;
+ }
+ } /* scan along page */
+
+ /*
+ * We don't bother clearing *all_frozen when the page is discovered not to
+ * be all-visible, so do that now if necessary. The page might fail to be
+ * all-frozen for other reasons anyway, but if it's not all-visible, then
+ * it definitely isn't all-frozen.
+ */
+ if (!all_visible)
+ *all_frozen = false;
+
+ return all_visible;
+}