/*-------------------------------------------------------------------------
 *
 * vacuumlazy.c
 *	  Concurrent ("lazy") vacuuming.
 *
 *
 * The major space usage for LAZY VACUUM is storage for the array of dead
 * tuple TIDs, with the next biggest need being storage for per-disk-page
 * free space info.  We want to ensure we can vacuum even the very largest
 * relations with finite memory space usage.  To do that, we set upper bounds
 * on the number of tuples and pages we will keep track of at once.
 *
 * We are willing to use at most maintenance_work_mem memory space to keep
 * track of dead tuples.  We initially allocate an array of TIDs of that size.
 * If the array threatens to overflow, we suspend the heap scan phase and
 * perform a pass of index cleanup and page compaction, then resume the heap
 * scan with an empty TID array.
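 *
 * (Worked example: maintenance_work_mem is measured in kilobytes, and an
 * ItemPointerData is 6 bytes, so a setting of 16384 (16MB) leaves room for
 * about 16384 * 1024 / 6, or roughly 2.8 million, dead-tuple TIDs between
 * index-cleanup passes; see lazy_space_alloc below.)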
 *
 * We can limit the storage for page free space to MaxFSMPages entries,
 * since that's the most the free space map will be willing to remember
 * anyway.  If the relation has fewer than that many pages with free space,
 * life is easy: just build an array of per-page info.  If it has more,
 * we store the free space info as a heap ordered by amount of free space,
 * so that we can discard the pages with least free space to ensure we never
 * have more than MaxFSMPages entries in all.  The surviving page entries
 * are passed to the free space map at conclusion of the scan.
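 *
 * (Worked example: once the array holds MaxFSMPages entries it is turned
 * into a min-heap on available space, so entry zero is always the page with
 * the least free space.  Each later page either displaces that minimum, if
 * it offers more space, or is discarded on the spot; see
 * lazy_record_free_space below.)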
 *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.66 2006/02/11 23:31:34 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/heapam.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/freespace.h"
#include "storage/smgr.h"
#include "utils/lsyscache.h"
#include "utils/pg_rusage.h"


/*
 * Space/time tradeoff parameters: do these need to be user-tunable?
 *
 * To consider truncating the relation, we want there to be at least
 * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
 * is less) potentially-freeable pages.
 */
#define REL_TRUNCATE_MINIMUM	1000
#define REL_TRUNCATE_FRACTION	16
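
/*
 * (Worked example: an 8000-page relation is considered for truncation once
 * at least min(1000, 8000/16) = 500 trailing pages look freeable, while a
 * 64000-page relation needs min(1000, 64000/16) = 1000 such pages.)
 */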

typedef struct LVRelStats
{
	/* Overall statistics about rel */
	BlockNumber rel_pages;
	double		rel_tuples;
	BlockNumber pages_removed;
	double		tuples_deleted;
	BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
	Size		threshold;		/* minimum interesting free space */
	/* List of TIDs of tuples we intend to delete */
	/* NB: this list is ordered by TID address */
	int			num_dead_tuples;	/* current # of entries */
	int			max_dead_tuples;	/* # slots allocated in array */
	ItemPointer dead_tuples;	/* array of ItemPointerData */
	/* Array or heap of per-page info about free space */
	/* We use a simple array until it fills up, then convert to heap */
	bool		fs_is_heap;		/* are we using heap organization? */
	int			num_free_pages; /* current # of entries */
	int			max_free_pages; /* # slots allocated in array */
	PageFreeSpaceInfo *free_pages;	/* array or heap of blkno/avail */
} LVRelStats;

static int	elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes);
static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
static void lazy_scan_index(Relation indrel, LVRelStats *vacrelstats);
static void lazy_vacuum_index(Relation indrel,
				  double *index_tups_vacuumed,
				  BlockNumber *index_pages_removed,
				  LVRelStats *vacrelstats);
static int	lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats);
static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
static BlockNumber count_nondeletable_pages(Relation onerel,
						 LVRelStats *vacrelstats);
static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr);
static void lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail);
static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats);
static int	vac_cmp_itemptr(const void *left, const void *right);
static int	vac_cmp_page_spaces(const void *left, const void *right);


/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its num_pages and num_tuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	bool		hasindex;
	BlockNumber possibly_freeable;

	if (vacstmt->verbose)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	/* Set threshold for interesting free space = average request size */
	/* XXX should we scale it up or down?  Adjust vacuum.c too, if so */
	vacrelstats->threshold = GetAvgFSMRequestSize(&onerel->rd_node);

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, ShareUpdateExclusiveLock, &nindexes, &Irel);
	hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION)
		lazy_truncate_heap(onerel, vacrelstats);

	/* Update shared free space map with final free space info */
	lazy_update_fsm(onerel, vacrelstats);

	/* Update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(onerel),
						vacrelstats->rel_pages,
						vacrelstats->rel_tuples,
						hasindex);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared,
						 vacstmt->analyze, vacrelstats->rel_tuples);
}


/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes)
{
	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	double	   *index_tups_vacuumed;
	BlockNumber *index_pages_removed;
	bool		did_vacuum_index = false;
	int			i;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	/*
	 * Because index vacuuming is done in multiple passes, we have to keep
	 * track of the total number of rows and pages removed from each index.
	 * index_tups_vacuumed[i] is the number removed so far from the i'th
	 * index.  (For partial indexes this could well be different from
	 * tups_vacuumed.)  Likewise for index_pages_removed[i].
	 */
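	/*
	 * (For example, if the TID array fills up twice during the scan, each
	 * index gets two mid-scan ambulkdelete passes plus a final one if any
	 * dead tuples remain; the per-pass counts are summed into these arrays
	 * so the per-index log messages report grand totals.)
	 */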
	index_tups_vacuumed = (double *) palloc0(nindexes * sizeof(double));
	index_pages_removed = (BlockNumber *) palloc0(nindexes * sizeof(BlockNumber));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		pgchanged,
					tupgone,
					hastup;
		int			prev_dead_count;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
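		/*
		 * (A single page can add at most MaxHeapTuplesPerPage entries, so
		 * requiring that much free headroom before starting a page
		 * guarantees the array cannot overflow partway through it.)
		 */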
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
				lazy_vacuum_index(Irel[i],
								  &index_tups_vacuumed[i],
								  &index_pages_removed[i],
								  vacrelstats);
			did_vacuum_index = true;
			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
		}

		buf = ReadBuffer(onerel, blkno);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page.  Reclaim
			 * such pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple).  To
			 * interlock against that, release the buffer read lock (which we
			 * must do anyway) and grab the relation extension lock before
			 * re-locking in exclusive mode.  If the page is still
			 * uninitialized by then, it must be left over from a crashed
			 * backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);
			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
			if (PageIsNew(page))
			{
				ereport(WARNING,
						(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
								relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);
				empty_pages++;
				lazy_record_free_space(vacrelstats, blkno,
									   PageGetFreeSpace(page));
			}
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			WriteBuffer(buf);
			continue;
		}
		if (PageIsEmpty(page))
		{
			empty_pages++;
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buf);
			continue;
		}
		pgchanged = false;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:

					/*
					 * Tuple is good.  Consider whether to replace its xmin
					 * value with FrozenTransactionId.
					 *
					 * NB: Since we hold only a shared buffer lock here, we
					 * are assuming that TransactionId read/write is atomic.
					 * This is not the only place that makes such an
					 * assumption.  It'd be possible to avoid the assumption
					 * by momentarily acquiring exclusive lock, but for the
					 * moment I see no need to.
					 */
					if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
						TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
											  FreezeLimit))
					{
						HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
						/* infomask should be okay already */
						Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
						pgchanged = true;
					}

					/* Other checks... */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;
			}
		}						/* scan along page */

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.  If not, then we're done with this
		 * page, so remember its free space as-is.
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
		{
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
		}

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		if (pgchanged)
			WriteBuffer(buf);
		else
			ReleaseBuffer(buf);
	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
			lazy_vacuum_index(Irel[i],
							  &index_tups_vacuumed[i],
							  &index_pages_removed[i],
							  vacrelstats);
		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
	}
	else if (!did_vacuum_index)
	{
		/* Must do post-vacuum cleanup and statistics update anyway */
		for (i = 0; i < nindexes; i++)
			lazy_scan_index(Irel[i], vacrelstats);
	}

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep, nunused, empty_pages,
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 *		Note: the reason for doing this as a second pass is we cannot remove
 *		the tuples until we've removed their index entries, and we want to
 *		process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;
	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		buf = ReadBuffer(onerel, tblk);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		lazy_record_free_space(vacrelstats, tblk,
							   PageGetFreeSpace(page));
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		WriteBuffer(buf);
		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}


/*
 *	lazy_vacuum_page() -- free dead tuples on a page
 *					 and repair its fragmentation.
 *
 * Caller is expected to handle reading, locking, and writing the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
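/*
 * (Because dead_tuples is collected in physical scan order, all entries for
 * a given block are adjacent in the array; the loop below just stops at the
 * first entry whose block number differs, and lazy_vacuum_heap resumes from
 * the returned index.)
 */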
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
				 int tupindex, LVRelStats *vacrelstats)
{
	OffsetNumber unused[MaxOffsetNumber];
	int			uncnt;
	Page		page = BufferGetPage(buffer);
	ItemId		itemid;

	START_CRIT_SECTION();
	for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
	{
		BlockNumber tblk;
		OffsetNumber toff;

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
		if (tblk != blkno)
			break;				/* past end of tuples for this block */
		toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
		itemid = PageGetItemId(page, toff);
		itemid->lp_flags &= ~LP_USED;
	}

	uncnt = PageRepairFragmentation(page, unused);

	/* XLOG stuff */
	if (!onerel->rd_istemp)
	{
		XLogRecPtr	recptr;

		recptr = log_heap_clean(onerel, buffer, unused, uncnt);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
	}
	else
	{
		/* No XLOG record, but still need to flag that XID exists on disk */
		MyXactMadeTempRelUpdate = true;
	}

	END_CRIT_SECTION();

	return tupindex;
}


/*
 *	lazy_scan_index() -- scan one index relation to update pg_class statistic.
 *
 * We use this when we have no deletions to do.
 */
static void
lazy_scan_index(Relation indrel, LVRelStats *vacrelstats)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * Acquire appropriate type of lock on index: must be exclusive if index
	 * AM isn't concurrent-safe.
	 */
	if (indrel->rd_am->amconcurrent)
		LockRelation(indrel, RowExclusiveLock);
	else
		LockRelation(indrel, AccessExclusiveLock);

	/*
	 * Even though we're not planning to delete anything, we use the
	 * ambulkdelete call, because (a) the scan happens within the index AM
	 * for more speed, and (b) it may want to pass private statistics to the
	 * amvacuumcleanup call.
	 */
	stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);

	/* Do post-VACUUM cleanup, even though we deleted nothing */
	vcinfo.vacuum_full = false;
	vcinfo.message_level = elevel;
	vcinfo.num_heap_tuples = vacrelstats->rel_tuples;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	/*
	 * Release lock acquired above.
	 */
	if (indrel->rd_am->amconcurrent)
		UnlockRelation(indrel, RowExclusiveLock);
	else
		UnlockRelation(indrel, AccessExclusiveLock);

	if (!stats)
		return;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}


/*
 *	lazy_vacuum_index() -- vacuum one index relation.
 *
 *		Delete all the index entries pointing to tuples listed in
 *		vacrelstats->dead_tuples.
 *
 *		Increment *index_tups_vacuumed by the number of index entries
 *		removed, and *index_pages_removed by the number of pages removed.
 *
 *		Finally, we arrange to update the index relation's statistics in
 *		pg_class.
 */
static void
lazy_vacuum_index(Relation indrel,
				  double *index_tups_vacuumed,
				  BlockNumber *index_pages_removed,
				  LVRelStats *vacrelstats)
{
	IndexBulkDeleteResult *stats;
	IndexVacuumCleanupInfo vcinfo;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * Acquire appropriate type of lock on index: must be exclusive if index
	 * AM isn't concurrent-safe.
	 */
	if (indrel->rd_am->amconcurrent)
		LockRelation(indrel, RowExclusiveLock);
	else
		LockRelation(indrel, AccessExclusiveLock);

	/* Do bulk deletion */
	stats = index_bulk_delete(indrel, lazy_tid_reaped, (void *) vacrelstats);

	/* Do post-VACUUM cleanup */
	vcinfo.vacuum_full = false;
	vcinfo.message_level = elevel;
	/* We don't yet know rel_tuples, so pass -1 */
	/* index_bulk_delete can't have skipped scan anyway ... */
	vcinfo.num_heap_tuples = -1;

	stats = index_vacuum_cleanup(indrel, &vcinfo, stats);

	/*
	 * Release lock acquired above.
	 */
	if (indrel->rd_am->amconcurrent)
		UnlockRelation(indrel, RowExclusiveLock);
	else
		UnlockRelation(indrel, AccessExclusiveLock);

	if (!stats)
		return;

	/* accumulate total removed over multiple index-cleaning cycles */
	*index_tups_vacuumed += stats->tuples_removed;
	*index_pages_removed += stats->pages_removed;

	/* now update statistics in pg_class */
	vac_update_relstats(RelationGetRelid(indrel),
						stats->num_pages,
						stats->num_index_tuples,
						false);

	ereport(elevel,
			(errmsg("index \"%s\" now contains %.0f row versions in %u pages",
					RelationGetRelationName(indrel),
					stats->num_index_tuples,
					stats->num_pages),
			 errdetail("%.0f index row versions were removed.\n"
					   "%u index pages have been deleted, %u are currently reusable.\n"
					   "%s.",
					   stats->tuples_removed,
					   stats->pages_deleted, stats->pages_free,
					   pg_rusage_show(&ru0))));

	pfree(stats);
}


/*
 * lazy_truncate_heap - try to truncate off any empty pages at the end
 */
static void
lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber old_rel_pages = vacrelstats->rel_pages;
	BlockNumber new_rel_pages;
	PageFreeSpaceInfo *pageSpaces;
	int			n;
	int			i,
				j;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);

	/*
	 * We need full exclusive lock on the relation in order to do truncation.
	 * If we can't get it, give up rather than waiting --- we don't want to
	 * block other backends, and we don't want to deadlock (which is quite
	 * possible considering we already hold a lower-grade lock).
	 */
	if (!ConditionalLockRelation(onerel, AccessExclusiveLock))
		return;

	/*
	 * Now that we have exclusive lock, look to see if the rel has grown
	 * whilst we were vacuuming with non-exclusive lock.  If so, give up; the
	 * newly added pages presumably contain non-deletable tuples.
	 */
	new_rel_pages = RelationGetNumberOfBlocks(onerel);
	if (new_rel_pages != old_rel_pages)
	{
		/* might as well use the latest news when we update pg_class stats */
		vacrelstats->rel_pages = new_rel_pages;
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Scan backwards from the end to verify that the end pages actually
	 * contain nothing we need to keep.  This is *necessary*, not optional,
	 * because other backends could have added tuples to these pages whilst
	 * we were vacuuming.
	 */
	new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);

	if (new_rel_pages >= old_rel_pages)
	{
		/* can't do anything after all */
		UnlockRelation(onerel, AccessExclusiveLock);
		return;
	}

	/*
	 * Okay to truncate.
	 */
	RelationTruncate(onerel, new_rel_pages);

	/*
	 * Drop free-space info for removed blocks; these must not get entered
	 * into the FSM!
	 */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->num_free_pages;
	j = 0;
	for (i = 0; i < n; i++)
	{
		if (pageSpaces[i].blkno < new_rel_pages)
		{
			pageSpaces[j] = pageSpaces[i];
			j++;
		}
	}
	vacrelstats->num_free_pages = j;
	/* We destroyed the heap ordering, so mark array unordered */
	vacrelstats->fs_is_heap = false;

	/* update statistics */
	vacrelstats->rel_pages = new_rel_pages;
	vacrelstats->pages_removed = old_rel_pages - new_rel_pages;

	/*
	 * We keep the exclusive lock until commit (perhaps not necessary)?
	 */

	ereport(elevel,
			(errmsg("\"%s\": truncated %u to %u pages",
					RelationGetRelationName(onerel),
					old_rel_pages, new_rel_pages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}


/*
 * Rescan end pages to verify that they are (still) empty of needed tuples.
 *
 * Returns number of nondeletable pages (last nonempty page + 1).
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber blkno;
	HeapTupleData tuple;

	/* Strange coding of loop control is needed because blkno is unsigned */
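	/*
	 * (A conventional "for (blkno = rel_pages - 1; blkno >= nonempty_pages;
	 * blkno--)" would never terminate when nonempty_pages is zero, because
	 * an unsigned blkno wraps around instead of going negative; hence the
	 * decrement inside the loop below.)
	 */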
	blkno = vacrelstats->rel_pages;

	while (blkno > vacrelstats->nonempty_pages)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;

		vacuum_delay_point();

		blkno--;

		buf = ReadBuffer(onerel, blkno);

		/* In this phase we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			/* PageIsNew probably shouldn't happen... */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buf);
			continue;
		}
		hastup = false;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
				continue;

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Shouldn't be necessary to re-freeze anything */
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (!tupgone)
			{
				hastup = true;
				break;			/* can stop scanning */
			}
		}						/* scan along page */

		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buf);

		/* Done scanning if we found a tuple here */
		if (hastup)
			return blkno + 1;
	}

	/*
	 * If we fall out of the loop, all the previously-thought-to-be-empty
	 * pages really are; we need not bother to look at the last known-nonempty
	 * page.
	 */
	return vacrelstats->nonempty_pages;
}


/*
 * lazy_space_alloc - space allocation decisions for lazy vacuum
 *
 * See the comments at the head of this file for rationale.
 */
static void
lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
{
	long		maxtuples;
	int			maxpages;

	maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
	maxtuples = Min(maxtuples, INT_MAX);
	/* stay sane if small maintenance_work_mem */
	maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
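	/*
	 * (Example: maintenance_work_mem = 16384, i.e. 16MB, yields
	 * 16384 * 1024 / 6 = 2796202 TIDs, comfortably between
	 * MaxHeapTuplesPerPage and INT_MAX, so neither clamp fires at typical
	 * settings.)
	 */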

	vacrelstats->num_dead_tuples = 0;
	vacrelstats->max_dead_tuples = (int) maxtuples;
	vacrelstats->dead_tuples = (ItemPointer)
		palloc(maxtuples * sizeof(ItemPointerData));

	maxpages = MaxFSMPages;
	/* No need to allocate more pages than the relation has blocks */
	if (relblocks < (BlockNumber) maxpages)
		maxpages = (int) relblocks;

	vacrelstats->fs_is_heap = false;
	vacrelstats->num_free_pages = 0;
	vacrelstats->max_free_pages = maxpages;
	vacrelstats->free_pages = (PageFreeSpaceInfo *)
		palloc(maxpages * sizeof(PageFreeSpaceInfo));
}


/*
 * lazy_record_dead_tuple - remember one deletable tuple
 */
static void
lazy_record_dead_tuple(LVRelStats *vacrelstats,
					   ItemPointer itemptr)
{
	/*
	 * The array shouldn't overflow under normal behavior, but perhaps it
	 * could if we are given a really small maintenance_work_mem.  In that
	 * case, just forget the last few tuples.
	 */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}


/*
 * lazy_record_free_space - remember free space on one page
 */
static void
lazy_record_free_space(LVRelStats *vacrelstats,
					   BlockNumber page, Size avail)
{
	PageFreeSpaceInfo *pageSpaces;
	int			n;

	/*
	 * A page with less than vacrelstats->threshold free space will be
	 * forgotten immediately, and never passed to the free space map.
	 * Removing the uselessly small entries early saves cycles, and in
	 * particular reduces the amount of time we spend holding the FSM lock
	 * when we finally call RecordRelationFreeSpace.  Since the FSM will
	 * probably drop pages with little free space anyway, there's no point
	 * in making this really small.
	 *
	 * XXX Is it worth trying to measure average tuple size, and using that
	 * to adjust the threshold?  Would be worthwhile if FSM has no stats yet
	 * for this relation.  But changing the threshold as we scan the rel
	 * might lead to bizarre behavior, too.  Also, it's probably better if
	 * vacuum.c has the same thresholding behavior as we do here.
	 */
	if (avail < vacrelstats->threshold)
		return;

	/* Copy pointers to local variables for notational simplicity */
	pageSpaces = vacrelstats->free_pages;
	n = vacrelstats->max_free_pages;

	/* If we haven't filled the array yet, just keep adding entries */
	if (vacrelstats->num_free_pages < n)
	{
		pageSpaces[vacrelstats->num_free_pages].blkno = page;
		pageSpaces[vacrelstats->num_free_pages].avail = avail;
		vacrelstats->num_free_pages++;
		return;
	}

	/*----------
	 * The rest of this routine works with "heap" organization of the
	 * free space arrays, wherein we maintain the heap property
	 *			avail[(j-1) div 2] <= avail[j]  for 0 < j < n.
	 * In particular, the zero'th element always has the smallest available
	 * space and can be discarded to make room for a new page with more space.
	 * See Knuth's discussion of heap-based priority queues, sec 5.2.3;
	 * but note he uses 1-origin array subscripts, not 0-origin.
	 *----------
	 */
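
	/*
	 * (Worked example: with n = 4 and avail = {10, 40, 20, 70}, the property
	 * holds and entry 0, avail 10, is the minimum.  Inserting a page with
	 * avail 30 replaces that minimum: the smaller child, avail 20, sifts up
	 * into slot 0 and the new page lands in slot 2, giving {20, 40, 30, 70}.)
	 */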

	/* If we haven't yet converted the array to heap organization, do it */
	if (!vacrelstats->fs_is_heap)
	{
		/*
		 * Scan backwards through the array, "sift-up" each value into its
		 * correct position.  We can start the scan at n/2-1 since each entry
		 * above that position has no children to worry about.
		 */
		int			l = n / 2;

		while (--l >= 0)
		{
			BlockNumber R = pageSpaces[l].blkno;
			Size		K = pageSpaces[l].avail;
			int			i;		/* i is where the "hole" is */

			i = l;
			for (;;)
			{
				int			j = 2 * i + 1;

				if (j >= n)
					break;
				if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
					j++;
				if (K <= pageSpaces[j].avail)
					break;
				pageSpaces[i] = pageSpaces[j];
				i = j;
			}
			pageSpaces[i].blkno = R;
			pageSpaces[i].avail = K;
		}

		vacrelstats->fs_is_heap = true;
	}

	/* If new page has more than zero'th entry, insert it into heap */
	if (avail > pageSpaces[0].avail)
	{
		/*
		 * Notionally, we replace the zero'th entry with the new data, and
		 * then sift-up to maintain the heap property.  Physically, the new
		 * data doesn't get stored into the arrays until we find the right
		 * location for it.
		 */
		int			i = 0;		/* i is where the "hole" is */

		for (;;)
		{
			int			j = 2 * i + 1;

			if (j >= n)
				break;
			if (j + 1 < n && pageSpaces[j].avail > pageSpaces[j + 1].avail)
				j++;
			if (avail <= pageSpaces[j].avail)
				break;
			pageSpaces[i] = pageSpaces[j];
			i = j;
		}
		pageSpaces[i].blkno = page;
		pageSpaces[i].avail = avail;
	}
}


/*
 *	lazy_tid_reaped() -- is a particular tid deletable?
 *
 *		This has the right signature to be an IndexBulkDeleteCallback.
 *
 *		Assumes dead_tuples array is in sorted order.
 */
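/*
 * (The sorted order comes for free: lazy_scan_heap records TIDs in physical
 * block/offset order, which is exactly the ordering vac_cmp_itemptr defines,
 * so each callback probe is a binary search costing O(log N) comparisons.)
 */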
static bool
lazy_tid_reaped(ItemPointer itemptr, void *state)
{
	LVRelStats *vacrelstats = (LVRelStats *) state;
	ItemPointer res;

	res = (ItemPointer) bsearch((void *) itemptr,
								(void *) vacrelstats->dead_tuples,
								vacrelstats->num_dead_tuples,
								sizeof(ItemPointerData),
								vac_cmp_itemptr);

	return (res != NULL);
}

/*
 * Dummy version for lazy_scan_index.
 */
static bool
dummy_tid_reaped(ItemPointer itemptr, void *state)
{
	return false;
}

/*
 * Update the shared Free Space Map with the info we now have about
 * free space in the relation, discarding any old info the map may have.
 */
static void
lazy_update_fsm(Relation onerel, LVRelStats *vacrelstats)
{
	PageFreeSpaceInfo *pageSpaces = vacrelstats->free_pages;
	int			nPages = vacrelstats->num_free_pages;

	/*
	 * Sort data into order, as required by RecordRelationFreeSpace.
	 */
	if (nPages > 1)
		qsort(pageSpaces, nPages, sizeof(PageFreeSpaceInfo),
			  vac_cmp_page_spaces);

	RecordRelationFreeSpace(&onerel->rd_node, nPages, pageSpaces);
}

/*
 * Comparator routines for use with qsort() and bsearch().
 */
static int
vac_cmp_itemptr(const void *left, const void *right)
{
	BlockNumber lblk,
				rblk;
	OffsetNumber loff,
				roff;

	lblk = ItemPointerGetBlockNumber((ItemPointer) left);
	rblk = ItemPointerGetBlockNumber((ItemPointer) right);

	if (lblk < rblk)
		return -1;
	if (lblk > rblk)
		return 1;

	loff = ItemPointerGetOffsetNumber((ItemPointer) left);
	roff = ItemPointerGetOffsetNumber((ItemPointer) right);

	if (loff < roff)
		return -1;
	if (loff > roff)
		return 1;

	return 0;
}

static int
vac_cmp_page_spaces(const void *left, const void *right)
{
	PageFreeSpaceInfo *linfo = (PageFreeSpaceInfo *) left;
	PageFreeSpaceInfo *rinfo = (PageFreeSpaceInfo *) right;

	if (linfo->blkno < rinfo->blkno)
		return -1;
	else if (linfo->blkno > rinfo->blkno)
		return 1;
	return 0;
}