granicus.if.org Git - postgresql/blob - src/backend/commands/vacuum.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * vacuum.c
   4  *        The postgres vacuum cleaner.
   5  *
   6  * This file includes the "full" version of VACUUM, as well as control code
   7  * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
   8  * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
   9  *
  10  *
  11  * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
  12  * Portions Copyright (c) 1994, Regents of the University of California
  13  *
  14  *
  15  * IDENTIFICATION
  16  *        $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.229 2002/06/15 21:52:31 tgl Exp $
  17  *
  18  *-------------------------------------------------------------------------
  19  */
  20 #include "postgres.h"
  21
  22 #include <unistd.h>
  23
  24 #include "access/clog.h"
  25 #include "access/genam.h"
  26 #include "access/heapam.h"
  27 #include "access/xlog.h"
  28 #include "catalog/catalog.h"
  29 #include "catalog/catname.h"
  30 #include "catalog/namespace.h"
  31 #include "catalog/pg_database.h"
  32 #include "catalog/pg_index.h"
  33 #include "commands/vacuum.h"
  34 #include "executor/executor.h"
  35 #include "miscadmin.h"
  36 #include "storage/freespace.h"
  37 #include "storage/sinval.h"
  38 #include "storage/smgr.h"
  39 #include "tcop/pquery.h"
  40 #include "utils/acl.h"
  41 #include "utils/builtins.h"
  42 #include "utils/fmgroids.h"
  43 #include "utils/inval.h"
  44 #include "utils/lsyscache.h"
  45 #include "utils/relcache.h"
  46 #include "utils/syscache.h"
  47 #include "pgstat.h"
  48
  49
  50 typedef struct VacPageData
  51 {
  52         BlockNumber blkno;                      /* BlockNumber of this Page */
  53         Size            free;                   /* FreeSpace on this Page */
  54         uint16          offsets_used;   /* Number of OffNums used by vacuum */
  55         uint16          offsets_free;   /* Number of OffNums free or to be free */
  56         OffsetNumber offsets[1];        /* Array of free OffNums */
  57 } VacPageData;
  58
  59 typedef VacPageData *VacPage;
  60
  61 typedef struct VacPageListData
  62 {
  63         BlockNumber empty_end_pages;    /* Number of "empty" end-pages */
  64         int                     num_pages;              /* Number of pages in pagedesc */
  65         int                     num_allocated_pages;    /* Number of allocated pages in
  66                                                                                  * pagedesc */
  67         VacPage    *pagedesc;           /* Descriptions of pages */
  68 } VacPageListData;
  69
  70 typedef VacPageListData *VacPageList;
  71
  72 typedef struct VTupleLinkData
  73 {
  74         ItemPointerData new_tid;
  75         ItemPointerData this_tid;
  76 } VTupleLinkData;
  77
  78 typedef VTupleLinkData *VTupleLink;
  79
  80 typedef struct VTupleMoveData
  81 {
  82         ItemPointerData tid;            /* tuple ID */
  83         VacPage         vacpage;                /* where to move */
  84         bool            cleanVpd;               /* clean vacpage before using */
  85 } VTupleMoveData;
  86
  87 typedef VTupleMoveData *VTupleMove;
  88
  89 typedef struct VRelStats
  90 {
  91         BlockNumber rel_pages;
  92         double          rel_tuples;
  93         Size            min_tlen;
  94         Size            max_tlen;
  95         bool            hasindex;
  96         int                     num_vtlinks;
  97         VTupleLink      vtlinks;
  98 } VRelStats;
  99
 100
 101 static MemoryContext vac_context = NULL;
 102
 103 static int elevel = -1;
 104
 105 static TransactionId OldestXmin;
 106 static TransactionId FreezeLimit;
 107
 108 static TransactionId initialOldestXmin;
 109 static TransactionId initialFreezeLimit;
 110
 111
 112 /* non-export function prototypes */
 113 static List *getrels(const RangeVar *vacrel, const char *stmttype);
 114 static void vac_update_dbstats(Oid dbid,
 115                                    TransactionId vacuumXID,
 116                                    TransactionId frozenXID);
 117 static void vac_truncate_clog(TransactionId vacuumXID,
 118                                   TransactionId frozenXID);
 119 static void vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
 120 static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
 121 static void scan_heap(VRelStats *vacrelstats, Relation onerel,
 122                   VacPageList vacuum_pages, VacPageList fraged_pages);
 123 static void repair_frag(VRelStats *vacrelstats, Relation onerel,
 124                         VacPageList vacuum_pages, VacPageList fraged_pages,
 125                         int nindexes, Relation *Irel);
 126 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
 127                         VacPageList vacpagelist);
 128 static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
 129 static void vacuum_index(VacPageList vacpagelist, Relation indrel,
 130                          double num_tuples, int keep_tuples);
 131 static void scan_index(Relation indrel, double num_tuples);
 132 static bool tid_reaped(ItemPointer itemptr, void *state);
 133 static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
 134 static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
 135                            BlockNumber rel_pages);
 136 static VacPage copy_vac_page(VacPage vacpage);
 137 static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
 138 static void *vac_bsearch(const void *key, const void *base,
 139                         size_t nelem, size_t size,
 140                         int (*compar) (const void *, const void *));
 141 static int      vac_cmp_blk(const void *left, const void *right);
 142 static int      vac_cmp_offno(const void *left, const void *right);
 143 static int      vac_cmp_vtlinks(const void *left, const void *right);
 144 static bool enough_space(VacPage vacpage, Size len);
 145
 146
 147 /****************************************************************************
 148  *                                                                                                                                                      *
 149  *                      Code common to all flavors of VACUUM and ANALYZE                                *
 150  *                                                                                                                                                      *
 151  ****************************************************************************
 152  */
 153
 154
 155 /*
 156  * Primary entry point for VACUUM and ANALYZE commands.
 157  */
 158 void
 159 vacuum(VacuumStmt *vacstmt)
 160 {
 161         const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
 162         MemoryContext anl_context = NULL;
 163         List       *vrl,
 164                            *cur;
 165
 166         if (vacstmt->verbose)
 167                 elevel = INFO;
 168         else
 169                 elevel = DEBUG1;
 170
 171         /*
 172          * We cannot run VACUUM inside a user transaction block; if we were
 173          * inside a transaction, then our commit- and
 174          * start-transaction-command calls would not have the intended effect!
 175          * Furthermore, the forced commit that occurs before truncating the
 176          * relation's file would have the effect of committing the rest of the
 177          * user's transaction too, which would certainly not be the desired
 178          * behavior.
 179          */
 180         if (vacstmt->vacuum && IsTransactionBlock())
 181                 elog(ERROR, "%s cannot run inside a BEGIN/END block", stmttype);
 182
 183         /* Running VACUUM from a function would free the function context */
 184         if (vacstmt->vacuum && !MemoryContextContains(QueryContext, vacstmt))
 185                 elog(ERROR, "%s cannot be executed from a function", stmttype);
 186
 187         /*
 188          * Send info about dead objects to the statistics collector
 189          */
 190         if (vacstmt->vacuum)
 191                 pgstat_vacuum_tabstat();
 192
 193         /*
 194          * Create special memory context for cross-transaction storage.
 195          *
 196          * Since it is a child of QueryContext, it will go away eventually even
 197          * if we suffer an error; there's no need for special abort cleanup
 198          * logic.
 199          */
 200         vac_context = AllocSetContextCreate(QueryContext,
 201                                                                                 "Vacuum",
 202                                                                                 ALLOCSET_DEFAULT_MINSIZE,
 203                                                                                 ALLOCSET_DEFAULT_INITSIZE,
 204                                                                                 ALLOCSET_DEFAULT_MAXSIZE);
 205
 206         /*
 207          * If we are running only ANALYZE, we don't need per-table transactions,
 208          * but we still need a memory context with table lifetime.
 209          */
 210         if (vacstmt->analyze && !vacstmt->vacuum)
 211                 anl_context = AllocSetContextCreate(QueryContext,
 212                                                                                         "Analyze",
 213                                                                                         ALLOCSET_DEFAULT_MINSIZE,
 214                                                                                         ALLOCSET_DEFAULT_INITSIZE,
 215                                                                                         ALLOCSET_DEFAULT_MAXSIZE);
 216
 217         /* Build list of relations to process (note this lives in vac_context) */
 218         vrl = getrels(vacstmt->relation, stmttype);
 219
 220         /*
 221          * Formerly, there was code here to prevent more than one VACUUM from
 222          * executing concurrently in the same database.  However, there's no
 223          * good reason to prevent that, and manually removing lockfiles after
 224          * a vacuum crash was a pain for dbadmins.  So, forget about lockfiles,
 225          * and just rely on the locks we grab on each target table
 226          * to ensure that there aren't two VACUUMs running on the same table
 227          * at the same time.
 228          */
 229
 230         /*
 231          * The strangeness with committing and starting transactions here is due
 232          * to wanting to run each table's VACUUM as a separate transaction, so
 233          * that we don't hold locks unnecessarily long.  Also, if we are doing
 234          * VACUUM ANALYZE, the ANALYZE part runs as a separate transaction from
 235          * the VACUUM to further reduce locking.
 236          *
 237          * vacuum_rel expects to be entered with no transaction active; it will
 238          * start and commit its own transaction.  But we are called by an SQL
 239          * command, and so we are executing inside a transaction already.  We
 240          * commit the transaction started in PostgresMain() here, and start
 241          * another one before exiting to match the commit waiting for us back in
 242          * PostgresMain().
 243          *
 244          * In the case of an ANALYZE statement (no vacuum, just analyze) it's
 245          * okay to run the whole thing in the outer transaction, and so we skip
 246          * transaction start/stop operations.
 247          */
 248         if (vacstmt->vacuum)
 249         {
 250                 if (vacstmt->relation == NULL)
 251                 {
 252                         /*
 253                          * It's a database-wide VACUUM.
 254                          *
 255                          * Compute the initially applicable OldestXmin and FreezeLimit
 256                          * XIDs, so that we can record these values at the end of the
 257                          * VACUUM. Note that individual tables may well be processed with
 258                          * newer values, but we can guarantee that no (non-shared)
 259                          * relations are processed with older ones.
 260                          *
 261                          * It is okay to record non-shared values in pg_database, even though
 262                          * we may vacuum shared relations with older cutoffs, because only
 263                          * the minimum of the values present in pg_database matters.  We
 264                          * can be sure that shared relations have at some time been
 265                          * vacuumed with cutoffs no worse than the global minimum; for, if
 266                          * there is a backend in some other DB with xmin = OLDXMIN that's
 267                          * determining the cutoff with which we vacuum shared relations,
 268                          * it is not possible for that database to have a cutoff newer
 269                          * than OLDXMIN recorded in pg_database.
 270                          */
 271                         vacuum_set_xid_limits(vacstmt, false,
 272                                                                   &initialOldestXmin, &initialFreezeLimit);
 273                 }
 274
 275                 /* matches the StartTransaction in PostgresMain() */
 276                 CommitTransactionCommand();
 277         }
 278
 279         /*
 280          * Loop to process each selected relation.
 281          */
 282         foreach(cur, vrl)
 283         {
 284                 Oid             relid = (Oid) lfirsti(cur);
 285
 286                 if (vacstmt->vacuum)
 287                         vacuum_rel(relid, vacstmt, RELKIND_RELATION);
 288                 if (vacstmt->analyze)
 289                 {
 290                         MemoryContext old_context = NULL;
 291
 292                         /*
 293                          * If we vacuumed, use new transaction for analyze.  Otherwise,
 294                          * we can use the outer transaction, but we still need to call
 295                          * analyze_rel in a memory context that will be cleaned up on
 296                          * return (else we leak memory while processing multiple tables).
 297                          */
 298                         if (vacstmt->vacuum)
 299                                 StartTransactionCommand();
 300                         else
 301                                 old_context = MemoryContextSwitchTo(anl_context);
 302
 303                         analyze_rel(relid, vacstmt);
 304
 305                         if (vacstmt->vacuum)
 306                                 CommitTransactionCommand();
 307                         else
 308                         {
 309                                 MemoryContextSwitchTo(old_context);
 310                                 MemoryContextResetAndDeleteChildren(anl_context);
 311                         }
 312                 }
 313         }
 314
 315         /*
 316          * Finish up processing.
 317          */
 318         if (vacstmt->vacuum)
 319         {
 320                 /* here, we are not in a transaction */
 321
 322                 /* matches the CommitTransaction in PostgresMain() */
 323                 StartTransactionCommand();
 324
 325                 /*
 326                  * If we did a database-wide VACUUM, update the database's pg_database
 327                  * row with info about the transaction IDs used, and try to truncate
 328                  * pg_clog.
 329                  */
 330                 if (vacstmt->relation == NULL)
 331                 {
 332                         vac_update_dbstats(MyDatabaseId,
 333                                                            initialOldestXmin, initialFreezeLimit);
 334                         vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
 335                 }
 336         }
 337
 338         /*
 339          * Clean up working storage --- note we must do this after
 340          * StartTransactionCommand, else we might be trying to delete the
 341          * active context!
 342          */
 343         MemoryContextDelete(vac_context);
 344         vac_context = NULL;
 345
 346         if (anl_context)
 347                 MemoryContextDelete(anl_context);
 348 }
 349
 350 /*
 351  * Build a list of Oids for each relation to be processed
 352  *
 353  * The list is built in vac_context so that it will survive across our
 354  * per-relation transactions.
 355  */
 356 static List *
 357 getrels(const RangeVar *vacrel, const char *stmttype)
 358 {
 359         List       *vrl = NIL;
 360         MemoryContext oldcontext;
 361
 362         if (vacrel)
 363         {
 364                 /* Process specific relation */
 365                 Oid             relid;
 366
 367                 relid = RangeVarGetRelid(vacrel, false);
 368
 369                 /* Make a relation list entry for this guy */
 370                 oldcontext = MemoryContextSwitchTo(vac_context);
 371                 vrl = lappendi(vrl, relid);
 372                 MemoryContextSwitchTo(oldcontext);
 373         }
 374         else
 375         {
 376                 /* Process all plain relations listed in pg_class */
 377                 Relation        pgclass;
 378                 HeapScanDesc scan;
 379                 HeapTuple       tuple;
 380                 ScanKeyData key;
 381
 382                 ScanKeyEntryInitialize(&key, 0x0,
 383                                                            Anum_pg_class_relkind,
 384                                                            F_CHAREQ,
 385                                                            CharGetDatum(RELKIND_RELATION));
 386
 387                 pgclass = heap_openr(RelationRelationName, AccessShareLock);
 388
 389                 scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);
 390
 391                 while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 392                 {
 393                         /* Make a relation list entry for this guy */
 394                         oldcontext = MemoryContextSwitchTo(vac_context);
 395                         vrl = lappendi(vrl, tuple->t_data->t_oid);
 396                         MemoryContextSwitchTo(oldcontext);
 397                 }
 398
 399                 heap_endscan(scan);
 400                 heap_close(pgclass, AccessShareLock);
 401         }
 402
 403         return vrl;
 404 }
 405
 406 /*
 407  * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 408  */
 409 void
 410 vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
 411                                           TransactionId *oldestXmin,
 412                                           TransactionId *freezeLimit)
 413 {
 414         TransactionId limit;
 415
 416         *oldestXmin = GetOldestXmin(sharedRel);
 417
 418         Assert(TransactionIdIsNormal(*oldestXmin));
 419
 420         if (vacstmt->freeze)
 421         {
 422                 /* FREEZE option: use oldest Xmin as freeze cutoff too */
 423                 limit = *oldestXmin;
 424         }
 425         else
 426         {
 427                 /*
 428                  * Normal case: freeze cutoff is well in the past, to wit, about
 429                  * halfway to the wrap horizon
 430                  */
 431                 limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
 432         }
 433
 434         /*
 435          * Be careful not to generate a "permanent" XID
 436          */
 437         if (!TransactionIdIsNormal(limit))
 438                 limit = FirstNormalTransactionId;
 439
 440         /*
 441          * Ensure sane relationship of limits
 442          */
 443         if (TransactionIdFollows(limit, *oldestXmin))
 444         {
 445                 elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
 446                 limit = *oldestXmin;
 447         }
 448
 449         *freezeLimit = limit;
 450 }
 451
 452
 453 /*
 454  *      vac_update_relstats() -- update statistics for one relation
 455  *
 456  *              Update the whole-relation statistics that are kept in its pg_class
 457  *              row.  There are additional stats that will be updated if we are
 458  *              doing ANALYZE, but we always update these stats.  This routine works
 459  *              for both index and heap relation entries in pg_class.
 460  *
 461  *              We violate no-overwrite semantics here by storing new values for the
 462  *              statistics columns directly into the pg_class tuple that's already on
 463  *              the page.  The reason for this is that if we updated these tuples in
 464  *              the usual way, vacuuming pg_class itself wouldn't work very well ---
 465  *              by the time we got done with a vacuum cycle, most of the tuples in
 466  *              pg_class would've been obsoleted.  Of course, this only works for
 467  *              fixed-size never-null columns, but these are.
 468  *
 469  *              This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 470  *              ANALYZE.
 471  */
 472 void
 473 vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
 474                                         bool hasindex)
 475 {
 476         Relation        rd;
 477         HeapTupleData rtup;
 478         HeapTuple       ctup;
 479         Form_pg_class pgcform;
 480         Buffer          buffer;
 481
 482         /*
 483          * update number of tuples and number of pages in pg_class
 484          */
 485         rd = heap_openr(RelationRelationName, RowExclusiveLock);
 486
 487         ctup = SearchSysCache(RELOID,
 488                                                   ObjectIdGetDatum(relid),
 489                                                   0, 0, 0);
 490         if (!HeapTupleIsValid(ctup))
 491                 elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
 492                          relid);
 493
 494         /* get the buffer cache tuple */
 495         rtup.t_self = ctup->t_self;
 496         ReleaseSysCache(ctup);
 497         if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
 498                 elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
 499                          relid);
 500
 501         /* overwrite the existing statistics in the tuple */
 502         pgcform = (Form_pg_class) GETSTRUCT(&rtup);
 503         pgcform->relpages = (int32) num_pages;
 504         pgcform->reltuples = num_tuples;
 505         pgcform->relhasindex = hasindex;
 506
 507         /*
 508          * If we have discovered that there are no indexes, then there's no
 509          * primary key either.  This could be done more thoroughly...
 510          */
 511         if (!hasindex)
 512                 pgcform->relhaspkey = false;
 513
 514         /*
 515          * Invalidate the tuple in the catcaches; this also arranges to flush
 516          * the relation's relcache entry.  (If we fail to commit for some reason,
 517          * no flush will occur, but no great harm is done since there are no
 518          * noncritical state updates here.)
 519          */
 520         CacheInvalidateHeapTuple(rd, &rtup);
 521
 522         /* Write the buffer */
 523         WriteBuffer(buffer);
 524
 525         heap_close(rd, RowExclusiveLock);
 526 }
 527
 528
 529 /*
 530  *      vac_update_dbstats() -- update statistics for one database
 531  *
 532  *              Update the whole-database statistics that are kept in its pg_database
 533  *              row.
 534  *
 535  *              We violate no-overwrite semantics here by storing new values for the
 536  *              statistics columns directly into the tuple that's already on the page.
 537  *              As with vac_update_relstats, this avoids leaving dead tuples behind
 538  *              after a VACUUM; which is good since GetRawDatabaseInfo
 539  *              can get confused by finding dead tuples in pg_database.
 540  *
 541  *              This routine is shared by full and lazy VACUUM.  Note that it is only
 542  *              applied after a database-wide VACUUM operation.
 543  */
 544 static void
 545 vac_update_dbstats(Oid dbid,
 546                                    TransactionId vacuumXID,
 547                                    TransactionId frozenXID)
 548 {
 549         Relation        relation;
 550         ScanKeyData entry[1];
 551         HeapScanDesc scan;
 552         HeapTuple       tuple;
 553         Form_pg_database dbform;
 554
 555         relation = heap_openr(DatabaseRelationName, RowExclusiveLock);
 556
 557         /* Must use a heap scan, since there's no syscache for pg_database */
 558         ScanKeyEntryInitialize(&entry[0], 0x0,
 559                                                    ObjectIdAttributeNumber, F_OIDEQ,
 560                                                    ObjectIdGetDatum(dbid));
 561
 562         scan = heap_beginscan(relation, SnapshotNow, 1, entry);
 563
 564         tuple = heap_getnext(scan, ForwardScanDirection);
 565
 566         if (!HeapTupleIsValid(tuple))
 567                 elog(ERROR, "database %u does not exist", dbid);
 568
 569         dbform = (Form_pg_database) GETSTRUCT(tuple);
 570
 571         /* overwrite the existing statistics in the tuple */
 572         dbform->datvacuumxid = vacuumXID;
 573         dbform->datfrozenxid = frozenXID;
 574
 575         /* invalidate the tuple in the cache and write the buffer */
 576         CacheInvalidateHeapTuple(relation, tuple);
 577         WriteNoReleaseBuffer(scan->rs_cbuf);
 578
 579         heap_endscan(scan);
 580
 581         heap_close(relation, RowExclusiveLock);
 582 }
 583
 584
 585 /*
 586  *      vac_truncate_clog() -- attempt to truncate the commit log
 587  *
 588  *              Scan pg_database to determine the system-wide oldest datvacuumxid,
 589  *              and use it to truncate the transaction commit log (pg_clog).
 590  *              Also generate a warning if the system-wide oldest datfrozenxid
 591  *              seems to be in danger of wrapping around.
 592  *
 593  *              The passed XIDs are simply the ones I just wrote into my pg_database
 594  *              entry.  They're used to initialize the "min" calculations.
 595  *
 596  *              This routine is shared by full and lazy VACUUM.  Note that it is only
 597  *              applied after a database-wide VACUUM operation.
 598  */
 599 static void
 600 vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
 601 {
 602         TransactionId myXID;
 603         Relation        relation;
 604         HeapScanDesc scan;
 605         HeapTuple       tuple;
 606         int32           age;
 607         bool            vacuumAlreadyWrapped = false;
 608         bool            frozenAlreadyWrapped = false;
 609
 610         myXID = GetCurrentTransactionId();
 611
 612         relation = heap_openr(DatabaseRelationName, AccessShareLock);
 613
 614         scan = heap_beginscan(relation, SnapshotNow, 0, NULL);
 615
 616         while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 617         {
 618                 Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);
 619
 620                 /* Ignore non-connectable databases (eg, template0) */
 621                 /* It's assumed that these have been frozen correctly */
 622                 if (!dbform->datallowconn)
 623                         continue;
 624
 625                 if (TransactionIdIsNormal(dbform->datvacuumxid))
 626                 {
 627                         if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
 628                                 vacuumAlreadyWrapped = true;
 629                         else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
 630                                 vacuumXID = dbform->datvacuumxid;
 631                 }
 632                 if (TransactionIdIsNormal(dbform->datfrozenxid))
 633                 {
 634                         if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
 635                                 frozenAlreadyWrapped = true;
 636                         else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
 637                                 frozenXID = dbform->datfrozenxid;
 638                 }
 639         }
 640
 641         heap_endscan(scan);
 642
 643         heap_close(relation, AccessShareLock);
 644
 645         /*
 646          * Do not truncate CLOG if we seem to have suffered wraparound already;
 647          * the computed minimum XID might be bogus.
 648          */
 649         if (vacuumAlreadyWrapped)
 650         {
 651                 elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
 652                          "\n\tYou may have already suffered transaction-wraparound data loss.");
 653                 return;
 654         }
 655
 656         /* Truncate CLOG to the oldest vacuumxid */
 657         TruncateCLOG(vacuumXID);
 658
 659         /* Give warning about impending wraparound problems */
 660         if (frozenAlreadyWrapped)
 661         {
 662                 elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
 663                          "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
 664         }
 665         else
 666         {
 667                 age = (int32) (myXID - frozenXID);
 668                 if (age > (int32) ((MaxTransactionId >> 3) * 3))
 669                         elog(WARNING, "Some databases have not been vacuumed in %d transactions."
 670                                  "\n\tBetter vacuum them within %d transactions,"
 671                                  "\n\tor you may have a wraparound failure.",
 672                                  age, (int32) (MaxTransactionId >> 1) - age);
 673         }
 674 }
 675
 676
 677 /****************************************************************************
 678  *                                                                                                                                                      *
 679  *                      Code common to both flavors of VACUUM                                                   *
 680  *                                                                                                                                                      *
 681  ****************************************************************************
 682  */
 683
 684
 685 /*
 686  *      vacuum_rel() -- vacuum one heap relation
 687  *
 688  *              Doing one heap at a time incurs extra overhead, since we need to
 689  *              check that the heap exists again just before we vacuum it.      The
 690  *              reason that we do this is so that vacuuming can be spread across
 691  *              many small transactions.  Otherwise, two-phase locking would require
 692  *              us to lock the entire database during one pass of the vacuum cleaner.
 693  *
      *              relid is the OID of the relation to process.  vacstmt is the
      *              VACUUM statement; its "full" flag selects both the lock mode
      *              and whether full_vacuum_rel() or lazy_vacuum_rel() does the work.
      *              expected_relkind is the relkind the relation must have; a
      *              relation of any other kind is skipped with a WARNING.
      *
      *              The relation is silently skipped if its pg_class tuple has
      *              disappeared, and skipped with a WARNING if the current user
      *              fails the ownership check.  After the main relation has been
      *              processed and its transaction committed, the function calls
      *              itself for the relation's TOAST table, if it has one.
      *
 694  *              At entry and exit, we are not inside a transaction.
 695  */
 696 static void
 697 vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
 698 {
 699         LOCKMODE        lmode;
 700         Relation        onerel;
 701         LockRelId       onerelid;
 702         Oid                     toast_relid;
 703
 704         /* Begin a transaction for vacuuming this relation */
 705         StartTransactionCommand();
 706
 707         /*
 708          * Check for user-requested abort.      Note we want this to be inside a
 709          * transaction, so xact.c doesn't issue useless WARNING.
 710          */
 711         CHECK_FOR_INTERRUPTS();
 712
 713         /*
 714          * Race condition -- if the pg_class tuple has gone away since the
 715          * last time we saw it, we don't need to vacuum it.
 716          */
 717         if (!SearchSysCacheExists(RELOID,
 718                                                           ObjectIdGetDatum(relid),
 719                                                           0, 0, 0))
 720         {
 721                 CommitTransactionCommand();
 722                 return;
 723         }
 724
 725         /*
 726          * Determine the type of lock we want --- hard exclusive lock for a
 727          * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
 728          * vacuum.      Either way, we can be sure that no other backend is
 729          * vacuuming the same table.
 730          */
 731         lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;
 732
 733         /*
 734          * Open the class, get an appropriate lock on it, and check
 735          * permissions.
 736          *
 737          * We allow the user to vacuum a table if he is superuser, the table
 738          * owner, or the database owner (but in the latter case, only if it's
 739          * not a shared relation).      pg_class_ownercheck includes the superuser case.
 740          *
 741          * Note we choose to treat permissions failure as a WARNING and keep
 742          * trying to vacuum the rest of the DB --- is this appropriate?
 743          */
 744         onerel = relation_open(relid, lmode);
 745
 746         if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
 747                   (is_dbadmin(MyDatabaseId) && !onerel->rd_rel->relisshared)))
 748         {
 749                 elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
 750                          RelationGetRelationName(onerel));
 751                 relation_close(onerel, lmode);
 752                 CommitTransactionCommand();
 753                 return;
 754         }
 755
 756         /*
 757          * Check that it has the relkind the caller expects (a TOAST table, for
 758          * the recursive call below); we used to do this in getrels() but seems
              * safer to check after we've locked the relation.
 759          */
 760         if (onerel->rd_rel->relkind != expected_relkind)
 761         {
 762                 elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
 763                          RelationGetRelationName(onerel));
 764                 relation_close(onerel, lmode);
 765                 CommitTransactionCommand();
 766                 return;
 767         }
 768
 769         /*
 770          * Get a session-level lock too. This will protect our access to the
 771          * relation across multiple transactions, so that we can vacuum the
 772          * relation's TOAST table (if any) secure in the knowledge that no one
 773          * is deleting the parent relation.
 774          *
 775          * NOTE: this cannot block, even if someone else is waiting for access,
 776          * because the lock manager knows that both lock requests are from the
 777          * same process.
              *
              * The lock identifier is copied into a local variable because it is
              * still needed for UnlockRelationForSession() at the bottom of this
              * function, after onerel has been closed.
 778          */
 779         onerelid = onerel->rd_lockInfo.lockRelId;
 780         LockRelationForSession(&onerelid, lmode);
 781
 782         /*
 783          * Remember the relation's TOAST relation for later
              * (it is used only after onerel has been closed)
 784          */
 785         toast_relid = onerel->rd_rel->reltoastrelid;
 786
 787         /*
 788          * Do the actual work --- either FULL or "lazy" vacuum
 789          */
 790         if (vacstmt->full)
 791                 full_vacuum_rel(onerel, vacstmt);
 792         else
 793                 lazy_vacuum_rel(onerel, vacstmt);
 794
 795         /* all done with this class, but hold lock until commit */
 796         relation_close(onerel, NoLock);
 797
 798         /*
 799          * Complete the transaction and free all temporary memory used.
 800          */
 801         CommitTransactionCommand();
 802
 803         /*
 804          * If the relation has a secondary toast rel, vacuum that too while we
 805          * still hold the session lock on the master table.  Note however that
 806          * "analyze" will not get done on the toast table.      This is good,
 807          * because the toaster always uses hardcoded index access and
 808          * statistics are totally unimportant for toast relations.
              *
              * The recursive call starts and commits its own transaction, and
              * insists on relkind RELKIND_TOASTVALUE.
 809          */
 810         if (toast_relid != InvalidOid)
 811                 vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE);
 812
 813         /*
 814          * Now release the session-level lock on the master table.
 815          */
 816         UnlockRelationForSession(&onerelid, lmode);
 817 }
 818
 819
 820 /****************************************************************************
 821  *                                                                                                                                                      *
 822  *                      Code for VACUUM FULL (only)                                                                             *
 823  *                                                                                                                                                      *
 824  ****************************************************************************
 825  */
 826
 827
 828 /*
 829  *      full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 830  *
 831  *              This routine vacuums a single heap, cleans out its indexes, and
 832  *              updates its num_pages and num_tuples statistics.
 833  *
      *              onerel is the relation to vacuum; vacstmt is handed to
      *              vacuum_set_xid_limits() to compute OldestXmin and FreezeLimit.
      *
      *              The work proceeds in this order: scan_heap() builds the
      *              vacuum_pages and fraged_pages lists; the indexes are opened and
      *              either cleaned (vacuum_index) or merely scanned (scan_index);
      *              then repair_frag() runs if fraged_pages is non-empty, otherwise
      *              vacuum_heap() runs if vacuum_pages is non-empty, otherwise the
      *              relation's buffers are just flushed.  Finally the free space
      *              map and the pg_class statistics are updated.
      *
 834  *              At entry, we have already established a transaction and opened
 835  *              and locked the relation.
 836  */
 837 static void
 838 full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
 839 {
 840         VacPageListData vacuum_pages;           /* List of pages to vacuum and/or
 841                                                                                  * clean indexes */
 842         VacPageListData fraged_pages;           /* List of pages with space enough
 843                                                                                  * for re-using */
 844         Relation   *Irel;
 845         int                     nindexes,
 846                                 i;
 847         VRelStats  *vacrelstats;
 848         bool            reindex = false;
 849
             /*
              * NOTE: "reindex" is only consumed inside the "#ifdef NOT_USED"
              * sections below, so it has no effect unless NOT_USED is defined.
              */
 850         if (IsIgnoringSystemIndexes() &&
 851                 IsSystemRelation(onerel))
 852                 reindex = true;
 853
 854         vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
 855                                                   &OldestXmin, &FreezeLimit);
 856
 857         /*
 858          * Set up statistics-gathering machinery.
 859          */
 860         vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
 861         vacrelstats->rel_pages = 0;
 862         vacrelstats->rel_tuples = 0;
 863         vacrelstats->hasindex = false;
 864
 865         /* scan the heap */
             /*
              * NOTE(review): only num_pages is initialized in the two list headers
              * here; presumably vpage_insert() sets up the rest on first insertion
              * --- confirm.
              */
 866         vacuum_pages.num_pages = fraged_pages.num_pages = 0;
 867         scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
 868
 869         /* Now open all indexes of the relation */
 870         vac_open_indexes(onerel, &nindexes, &Irel);
 871         if (!Irel)
 872                 reindex = false;
 873         else if (!RelationGetForm(onerel)->relhasindex)
 874                 reindex = true;
 875         if (nindexes > 0)
 876                 vacrelstats->hasindex = true;
 877
 878 #ifdef NOT_USED
 879
 880         /*
 881          * reindex in VACUUM is dangerous under WAL. ifdef out until it
 882          * becomes safe.
 883          */
 884         if (reindex)
 885         {
 886                 vac_close_indexes(nindexes, Irel);
 887                 Irel = (Relation *) NULL;
 888                 activate_indexes_of_a_table(RelationGetRelid(onerel), false);
 889         }
 890 #endif   /* NOT_USED */
 891
 892         /* Clean/scan index relation(s) */
 893         if (Irel != (Relation *) NULL)
 894         {
 895                 if (vacuum_pages.num_pages > 0)
 896                 {
 897                         for (i = 0; i < nindexes; i++)
 898                                 vacuum_index(&vacuum_pages, Irel[i],
 899                                                          vacrelstats->rel_tuples, 0);
 900                 }
 901                 else
 902                 {
 903                         /* just scan indexes to update statistic */
 904                         for (i = 0; i < nindexes; i++)
 905                                 scan_index(Irel[i], vacrelstats->rel_tuples);
 906                 }
 907         }
 908
 909         if (fraged_pages.num_pages > 0)
 910         {
 911                 /* Try to shrink heap */
                     /* repair_frag() is handed Irel, so close the indexes only afterwards */
 912                 repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
 913                                         nindexes, Irel);
 914                 vac_close_indexes(nindexes, Irel);
 915         }
 916         else
 917         {
                     /* none of the remaining steps is handed Irel, so close indexes first */
 918                 vac_close_indexes(nindexes, Irel);
 919                 if (vacuum_pages.num_pages > 0)
 920                 {
 921                         /* Clean pages from vacuum_pages list */
 922                         vacuum_heap(vacrelstats, onerel, &vacuum_pages);
 923                 }
 924                 else
 925                 {
 926                         /*
 927                          * Flush dirty pages out to disk.  We must do this even if we
 928                          * didn't do anything else, because we want to ensure that all
 929                          * tuples have correct on-row commit status on disk (see
 930                          * bufmgr.c's comments for FlushRelationBuffers()).
 931                          */
 932                         i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
 933                         if (i < 0)
 934                                 elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
 935                                          i);
 936                 }
 937         }
 938
 939 #ifdef NOT_USED
 940         if (reindex)
 941                 activate_indexes_of_a_table(RelationGetRelid(onerel), true);
 942 #endif   /* NOT_USED */
 943
 944         /* update shared free space map with final free space info */
 945         vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);
 946
 947         /* update statistics in pg_class */
 948         vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
 949                                                 vacrelstats->rel_tuples, vacrelstats->hasindex);
 950 }
 951
 952
 953 /*
 954  *      scan_heap() -- scan an open heap relation
 955  *
 956  *              This routine sets commit status bits, constructs vacuum_pages (list
 957  *              of pages we need to compact free space on and/or clean indexes of
 958  *              deleted tuples), constructs fraged_pages (list of pages with free
 959  *              space that tuples could be moved into), and calculates statistics
 960  *              on the number of live tuples in the heap.
      *
      *              Results are returned through the arguments: vacrelstats receives
      *              rel_tuples, rel_pages, min_tlen, max_tlen and the vtlinks array
      *              (sorted, or NULL) with its count; vacuum_pages and fraged_pages
      *              receive the page descriptors plus empty_end_pages, the length of
      *              the run of pages at the end of the relation on which no tuple
      *              survives.
      *
      *              A page that qualifies for both lists is represented by a single
      *              VacPage copy that is inserted into both of them.
      *
      *              The fraged_pages list is emptied if a tuple with an in-progress
      *              insert or delete is found, since the relation cannot be shrunk
      *              in that case.
 961  */
 962 static void
 963 scan_heap(VRelStats *vacrelstats, Relation onerel,
 964                   VacPageList vacuum_pages, VacPageList fraged_pages)
 965 {
 966         BlockNumber nblocks,
 967                                 blkno;
 968         ItemId          itemid;
 969         Buffer          buf;
 970         HeapTupleData tuple;
 971         OffsetNumber offnum,
 972                                 maxoff;
 973         bool            pgchanged,
 974                                 tupgone,
 975                                 notup;
 976         char       *relname;
 977         VacPage         vacpage,
 978                                 vacpagecopy;
 979         BlockNumber empty_pages,
 980                                 new_pages,
 981                                 changed_pages,
 982                                 empty_end_pages;
 983         double          num_tuples,
 984                                 tups_vacuumed,
 985                                 nkeep,
 986                                 nunused;
 987         double          free_size,
 988                                 usable_free_size;
 989         Size            min_tlen = MaxTupleSize;
 990         Size            max_tlen = 0;
 991         int                     i;
 992         bool            do_shrinking = true;
             /* update-chain link array: starts with room for 100 entries */
 993         VTupleLink      vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
 994         int                     num_vtlinks = 0;
 995         int                     free_vtlinks = 100;
 996         VacRUsage       ru0;
 997
 998         vac_init_rusage(&ru0);
 999
1000         relname = RelationGetRelationName(onerel);
1001         elog(elevel, "--Relation %s.%s--",
1002                  get_namespace_name(RelationGetNamespace(onerel)),
1003                  relname);
1004
1005         empty_pages = new_pages = changed_pages = empty_end_pages = 0;
1006         num_tuples = tups_vacuumed = nkeep = nunused = 0;
1007         free_size = 0;
1008
1009         nblocks = RelationGetNumberOfBlocks(onerel);
1010
1011         /*
1012          * We initially create each VacPage item in a maximal-sized workspace,
1013          * then copy the workspace into a just-large-enough copy.
1014          */
1015         vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
1016
1017         for (blkno = 0; blkno < nblocks; blkno++)
1018         {
1019                 Page            page,
1020                                         tempPage = NULL;
1021                 bool            do_reap,
1022                                         do_frag;
1023
1024                 CHECK_FOR_INTERRUPTS();
1025
1026                 buf = ReadBuffer(onerel, blkno);
1027                 page = BufferGetPage(buf);
1028
                     /* reset the shared workspace for this page */
1029                 vacpage->blkno = blkno;
1030                 vacpage->offsets_used = 0;
1031                 vacpage->offsets_free = 0;
1032
1033                 if (PageIsNew(page))
1034                 {
1035                         elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
1036                                  relname, blkno);
1037                         PageInit(page, BufferGetPageSize(buf), 0);
1038                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
                             /*
                              * One line pointer's worth of space is discounted ---
                              * presumably because a tuple stored here would need one.
                              */
1039                         free_size += (vacpage->free - sizeof(ItemIdData));
1040                         new_pages++;
1041                         empty_end_pages++;
1042                         vacpagecopy = copy_vac_page(vacpage);
1043                         vpage_insert(vacuum_pages, vacpagecopy);
1044                         vpage_insert(fraged_pages, vacpagecopy);
                             /* PageInit modified the page, so write it out */
1045                         WriteBuffer(buf);
1046                         continue;
1047                 }
1048
1049                 if (PageIsEmpty(page))
1050                 {
1051                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1052                         free_size += (vacpage->free - sizeof(ItemIdData));
1053                         empty_pages++;
1054                         empty_end_pages++;
1055                         vacpagecopy = copy_vac_page(vacpage);
1056                         vpage_insert(vacuum_pages, vacpagecopy);
1057                         vpage_insert(fraged_pages, vacpagecopy);
                             /* the page itself was not touched, so just drop the buffer */
1058                         ReleaseBuffer(buf);
1059                         continue;
1060                 }
1061
1062                 pgchanged = false;
1063                 notup = true;
1064                 maxoff = PageGetMaxOffsetNumber(page);
1065                 for (offnum = FirstOffsetNumber;
1066                          offnum <= maxoff;
1067                          offnum = OffsetNumberNext(offnum))
1068                 {
1069                         uint16          sv_infomask;
1070
1071                         itemid = PageGetItemId(page, offnum);
1072
1073                         /*
1074                          * Collect un-used items too - it's possible to have indexes
1075                          * pointing here after crash.
1076                          */
1077                         if (!ItemIdIsUsed(itemid))
1078                         {
1079                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1080                                 nunused += 1;
1081                                 continue;
1082                         }
1083
1084                         tuple.t_datamcxt = NULL;
1085                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1086                         tuple.t_len = ItemIdGetLength(itemid);
1087                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
1088
1089                         tupgone = false;
                             /* save infomask so a hint-bit change can be detected below */
1090                         sv_infomask = tuple.t_data->t_infomask;
1091
1092                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
1093                         {
1094                                 case HEAPTUPLE_DEAD:
1095                                         tupgone = true;         /* we can delete the tuple */
1096                                         break;
1097                                 case HEAPTUPLE_LIVE:
1098
1099                                         /*
1100                                          * Tuple is good.  Consider whether to replace its
1101                                          * xmin value with FrozenTransactionId.
1102                                          */
1103                                         if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
1104                                                 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
1105                                                                                           FreezeLimit))
1106                                         {
1107                                                 HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
1108                                                 /* infomask should be okay already */
1109                                                 Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
1110                                                 pgchanged = true;
1111                                         }
1112                                         break;
1113                                 case HEAPTUPLE_RECENTLY_DEAD:
1114
1115                                         /*
1116                                          * If tuple is recently deleted then we must not
1117                                          * remove it from relation.
1118                                          */
1119                                         nkeep += 1;
1120
1121                                         /*
1122                                          * If we do shrinking and this tuple is updated one
1123                                          * then remember it to construct updated tuple
1124                                          * dependencies.
                                              *
                                              * "Updated" is detected by t_ctid differing from
                                              * the tuple's own TID.  The link array is grown by
                                              * 1000 entries whenever its free slots run out.
1125                                          */
1126                                         if (do_shrinking &&
1127                                                 !(ItemPointerEquals(&(tuple.t_self),
1128                                                                                         &(tuple.t_data->t_ctid))))
1129                                         {
1130                                                 if (free_vtlinks == 0)
1131                                                 {
1132                                                         free_vtlinks = 1000;
1133                                                         vtlinks = (VTupleLink) repalloc(vtlinks,
1134                                                                                    (free_vtlinks + num_vtlinks) *
1135                                                                                                  sizeof(VTupleLinkData));
1136                                                 }
1137                                                 vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
1138                                                 vtlinks[num_vtlinks].this_tid = tuple.t_self;
1139                                                 free_vtlinks--;
1140                                                 num_vtlinks++;
1141                                         }
1142                                         break;
1143                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
1144
1145                                         /*
1146                                          * This should not happen, since we hold exclusive
1147                                          * lock on the relation; shouldn't we raise an error?
1148                                          */
1149                                         elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
1150                                                  relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
1151                                         do_shrinking = false;
1152                                         break;
1153                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
1154
1155                                         /*
1156                                          * This should not happen, since we hold exclusive
1157                                          * lock on the relation; shouldn't we raise an error?
1158                                          */
1159                                         elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
1160                                                  relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
1161                                         do_shrinking = false;
1162                                         break;
1163                                 default:
1164                                         elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
1165                                         break;
1166                         }
1167
1168                         /* check for hint-bit update by HeapTupleSatisfiesVacuum */
1169                         if (sv_infomask != tuple.t_data->t_infomask)
1170                                 pgchanged = true;
1171
1172                         /*
1173                          * Other checks...
1174                          */
1175                         if (!OidIsValid(tuple.t_data->t_oid) &&
1176                                 onerel->rd_rel->relhasoids)
1177                                 elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
1178                                          relname, blkno, offnum, (int) tupgone);
1179
1180                         if (tupgone)
1181                         {
1182                                 ItemId          lpp;
1183
1184                                 /*
1185                                  * Here we are building a temporary copy of the page with
1186                                  * dead tuples removed.  Below we will apply
1187                                  * PageRepairFragmentation to the copy, so that we can
1188                                  * determine how much space will be available after
1189                                  * removal of dead tuples.      But note we are NOT changing
1190                                  * the real page yet...
1191                                  */
1192                                 if (tempPage == (Page) NULL)
1193                                 {
1194                                         Size            pageSize;
1195
1196                                         pageSize = PageGetPageSize(page);
1197                                         tempPage = (Page) palloc(pageSize);
1198                                         memcpy(tempPage, page, pageSize);
1199                                 }
1200
1201                                 /* mark it unused on the temp page */
1202                                 lpp = PageGetItemId(tempPage, offnum);
1203                                 lpp->lp_flags &= ~LP_USED;
1204
1205                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1206                                 tups_vacuumed += 1;
1207                         }
1208                         else
1209                         {
                                     /* tuple stays: count it and track tuple length extremes */
1210                                 num_tuples += 1;
1211                                 notup = false;
1212                                 if (tuple.t_len < min_tlen)
1213                                         min_tlen = tuple.t_len;
1214                                 if (tuple.t_len > max_tlen)
1215                                         max_tlen = tuple.t_len;
1216                         }
1217                 }                                               /* scan along page */
1218
1219                 if (tempPage != (Page) NULL)
1220                 {
1221                         /* Some tuples are removable; figure free space after removal */
1222                         PageRepairFragmentation(tempPage, NULL);
1223                         vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
1224                         pfree(tempPage);
1225                         do_reap = true;
1226                 }
1227                 else
1228                 {
1229                         /* Just use current available space */
1230                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1231                         /* Need to reap the page if it has ~LP_USED line pointers */
1232                         do_reap = (vacpage->offsets_free > 0);
1233                 }
1234
1235                 free_size += vacpage->free;
1236
1237                 /*
1238                  * Add the page to fraged_pages if it has a useful amount of free
1239                  * space.  "Useful" means enough for a minimal-sized tuple. But we
1240                  * don't know that accurately near the start of the relation, so
1241                  * add pages unconditionally if they have >= BLCKSZ/10 free space.
1242                  */
1243                 do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
1244
                     /* one copy is made and shared by whichever lists want the page */
1245                 if (do_reap || do_frag)
1246                 {
1247                         vacpagecopy = copy_vac_page(vacpage);
1248                         if (do_reap)
1249                                 vpage_insert(vacuum_pages, vacpagecopy);
1250                         if (do_frag)
1251                                 vpage_insert(fraged_pages, vacpagecopy);
1252                 }
1253
                     /* track the run of pages at the end that keep no tuple */
1254                 if (notup)
1255                         empty_end_pages++;
1256                 else
1257                         empty_end_pages = 0;
1258
                     /* write the buffer only if a tuple header on the page was changed */
1259                 if (pgchanged)
1260                 {
1261                         WriteBuffer(buf);
1262                         changed_pages++;
1263                 }
1264                 else
1265                         ReleaseBuffer(buf);
1266         }
1267
1268         pfree(vacpage);
1269
1270         /* save stats in the rel list for use later */
1271         vacrelstats->rel_tuples = num_tuples;
1272         vacrelstats->rel_pages = nblocks;
1273         if (num_tuples == 0)
1274                 min_tlen = max_tlen = 0;
1275         vacrelstats->min_tlen = min_tlen;
1276         vacrelstats->max_tlen = max_tlen;
1277
1278         vacuum_pages->empty_end_pages = empty_end_pages;
1279         fraged_pages->empty_end_pages = empty_end_pages;
1280
1281         /*
1282          * Clear the fraged_pages list if we found we couldn't shrink. Else,
1283          * remove any "empty" end-pages from the list, and compute usable free
1284          * space = free space in remaining pages.
              *
              * NOTE(review): reducing num_pages by empty_end_pages assumes the
              * empty end pages are the last entries of fraged_pages, i.e. that
              * vpage_insert() appends --- confirm.
1285          */
1286         if (do_shrinking)
1287         {
1288                 Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
1289                 fraged_pages->num_pages -= empty_end_pages;
1290                 usable_free_size = 0;
1291                 for (i = 0; i < fraged_pages->num_pages; i++)
1292                         usable_free_size += fraged_pages->pagedesc[i]->free;
1293         }
1294         else
1295         {
1296                 fraged_pages->num_pages = 0;
1297                 usable_free_size = 0;
1298         }
1299
             /*
              * Keep the update-chain links (sorted with vac_cmp_vtlinks) only if
              * there is usable free space and at least one link was collected;
              * otherwise discard the array.
              */
1300         if (usable_free_size > 0 && num_vtlinks > 0)
1301         {
1302                 qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
1303                           vac_cmp_vtlinks);
1304                 vacrelstats->vtlinks = vtlinks;
1305                 vacrelstats->num_vtlinks = num_vtlinks;
1306         }
1307         else
1308         {
1309                 vacrelstats->vtlinks = NULL;
1310                 vacrelstats->num_vtlinks = 0;
1311                 pfree(vtlinks);
1312         }
1313
1314         elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
1315 Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, MaxLen %lu; \
1316 Re-using: Free/Avail. Space %.0f/%.0f; EndEmpty/Avail. Pages %u/%u.\n\t%s",
1317                  nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
1318                  new_pages, num_tuples, tups_vacuumed,
1319                  nkeep, vacrelstats->num_vtlinks,
1320                  nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
1321                  free_size, usable_free_size,
1322                  empty_end_pages, fraged_pages->num_pages,
1323                  vac_show_rusage(&ru0));
1324
1325 }
1326
1327
1328 /*
1329  *      repair_frag() -- try to repair relation's fragmentation
1330  *
1331  *              This routine marks dead tuples as unused and tries to re-use dead space
1332  *              by moving tuples (and inserting indexes if needed). It constructs
1333  *              Nvacpagelist list of freed pages (moved tuples) and cleans indexes
1334  *              for them after committing (in hack-manner - without losing locks
1335  *              and freeing memory!) current transaction. It truncates relation
1336  *              if some end-blocks are gone away.
1337  */
1338 static void
1339 repair_frag(VRelStats *vacrelstats, Relation onerel,
1340                         VacPageList vacuum_pages, VacPageList fraged_pages,
1341                         int nindexes, Relation *Irel)
1342 {
1343         TransactionId myXID;
1344         CommandId       myCID;
1345         Buffer          buf,
1346                                 cur_buffer;
1347         BlockNumber nblocks,
1348                                 blkno;
1349         BlockNumber last_move_dest_block = 0,
1350                                 last_vacuum_block;
1351         Page            page,
1352                                 ToPage = NULL;
1353         OffsetNumber offnum,
1354                                 maxoff,
1355                                 newoff,
1356                                 max_offset;
1357         ItemId          itemid,
1358                                 newitemid;
1359         HeapTupleData tuple,
1360                                 newtup;
1361         TupleDesc       tupdesc;
1362         ResultRelInfo *resultRelInfo;
1363         EState     *estate;
1364         TupleTable      tupleTable;
1365         TupleTableSlot *slot;
1366         VacPageListData Nvacpagelist;
1367         VacPage         cur_page = NULL,
1368                                 last_vacuum_page,
1369                                 vacpage,
1370                            *curpage;
1371         int                     cur_item = 0;
1372         int                     i;
1373         Size            tuple_len;
1374         int                     num_moved,
1375                                 num_fraged_pages,
1376                                 vacuumed_pages;
1377         int                     checked_moved,
1378                                 num_tuples,
1379                                 keep_tuples = 0;
1380         bool            isempty,
1381                                 dowrite,
1382                                 chain_tuple_moved;
1383         VacRUsage       ru0;
1384
1385         vac_init_rusage(&ru0);
1386
1387         myXID = GetCurrentTransactionId();
1388         myCID = GetCurrentCommandId();
1389
1390         tupdesc = RelationGetDescr(onerel);
1391
1392         /*
1393          * We need a ResultRelInfo and an EState so we can use the regular
1394          * executor's index-entry-making machinery.
1395          */
1396         resultRelInfo = makeNode(ResultRelInfo);
1397         resultRelInfo->ri_RangeTableIndex = 1;          /* dummy */
1398         resultRelInfo->ri_RelationDesc = onerel;
1399         resultRelInfo->ri_TrigDesc = NULL;      /* we don't fire triggers */
1400
1401         ExecOpenIndices(resultRelInfo);
1402
1403         estate = CreateExecutorState();
1404         estate->es_result_relations = resultRelInfo;
1405         estate->es_num_result_relations = 1;
1406         estate->es_result_relation_info = resultRelInfo;
1407
1408         /* Set up a dummy tuple table too */
1409         tupleTable = ExecCreateTupleTable(1);
1410         slot = ExecAllocTableSlot(tupleTable);
1411         ExecSetSlotDescriptor(slot, tupdesc, false);
1412
1413         Nvacpagelist.num_pages = 0;
1414         num_fraged_pages = fraged_pages->num_pages;
1415         Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
1416         vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
1417         if (vacuumed_pages > 0)
1418         {
1419                 /* get last reaped page from vacuum_pages */
1420                 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1421                 last_vacuum_block = last_vacuum_page->blkno;
1422         }
1423         else
1424         {
1425                 last_vacuum_page = NULL;
1426                 last_vacuum_block = InvalidBlockNumber;
1427         }
1428         cur_buffer = InvalidBuffer;
1429         num_moved = 0;
1430
1431         vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
1432         vacpage->offsets_used = vacpage->offsets_free = 0;
1433
1434         /*
1435          * Scan pages backwards from the last nonempty page, trying to move
1436          * tuples down to lower pages.  Quit when we reach a page that we have
1437          * moved any tuples onto, or the first page if we haven't moved
1438          * anything, or when we find a page we cannot completely empty (this
1439          * last condition is handled by "break" statements within the loop).
1440          *
1441          * NB: this code depends on the vacuum_pages and fraged_pages lists being
1442          * in order by blkno.
1443          */
1444         nblocks = vacrelstats->rel_pages;
1445         for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1446                  blkno > last_move_dest_block;
1447                  blkno--)
1448         {
1449                 CHECK_FOR_INTERRUPTS();
1450
1451                 /*
1452                  * Forget fraged_pages pages at or after this one; they're no
1453                  * longer useful as move targets, since we only want to move down.
1454                  * Note that since we stop the outer loop at last_move_dest_block,
1455                  * pages removed here cannot have had anything moved onto them
1456                  * already.
1457                  *
1458                  * Also note that we don't change the stored fraged_pages list, only
1459                  * our local variable num_fraged_pages; so the forgotten pages are
1460                  * still available to be loaded into the free space map later.
1461                  */
1462                 while (num_fraged_pages > 0 &&
1463                         fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
1464                 {
1465                         Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
1466                         --num_fraged_pages;
1467                 }
1468
1469                 /*
1470                  * Process this page of relation.
1471                  */
1472                 buf = ReadBuffer(onerel, blkno);
1473                 page = BufferGetPage(buf);
1474
1475                 vacpage->offsets_free = 0;
1476
1477                 isempty = PageIsEmpty(page);
1478
1479                 dowrite = false;
1480
1481                 /* Is the page in the vacuum_pages list? */
1482                 if (blkno == last_vacuum_block)
1483                 {
1484                         if (last_vacuum_page->offsets_free > 0)
1485                         {
1486                                 /* there are dead tuples on this page - clean them */
1487                                 Assert(!isempty);
1488                                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1489                                 vacuum_page(onerel, buf, last_vacuum_page);
1490                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1491                                 dowrite = true;
1492                         }
1493                         else
1494                                 Assert(isempty);
1495                         --vacuumed_pages;
1496                         if (vacuumed_pages > 0)
1497                         {
1498                                 /* get prev reaped page from vacuum_pages */
1499                                 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1500                                 last_vacuum_block = last_vacuum_page->blkno;
1501                         }
1502                         else
1503                         {
1504                                 last_vacuum_page = NULL;
1505                                 last_vacuum_block = InvalidBlockNumber;
1506                         }
1507                         if (isempty)
1508                         {
1509                                 ReleaseBuffer(buf);
1510                                 continue;
1511                         }
1512                 }
1513                 else
1514                         Assert(!isempty);
1515
1516                 chain_tuple_moved = false;              /* no one chain-tuple was moved
1517                                                                                  * off this page, yet */
1518                 vacpage->blkno = blkno;
1519                 maxoff = PageGetMaxOffsetNumber(page);
1520                 for (offnum = FirstOffsetNumber;
1521                          offnum <= maxoff;
1522                          offnum = OffsetNumberNext(offnum))
1523                 {
1524                         itemid = PageGetItemId(page, offnum);
1525
1526                         if (!ItemIdIsUsed(itemid))
1527                                 continue;
1528
1529                         tuple.t_datamcxt = NULL;
1530                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1531                         tuple_len = tuple.t_len = ItemIdGetLength(itemid);
1532                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
1533
1534                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1535                         {
1536                                 if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
1537                                         elog(ERROR, "Invalid XVAC in tuple header");
1538                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1539                                         elog(ERROR, "HEAP_MOVED_IN was not expected");
1540
1541                                 /*
1542                                  * If this (chain) tuple was already moved by me, then I
1543                                  * have to check whether it is in vacpage or not - i.e.,
1544                                  * was it moved while cleaning this page or a previous one.
1545                                  */
1546                                 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1547                                 {
1548                                         if (keep_tuples == 0)
1549                                                 continue;
1550                                         if (chain_tuple_moved)          /* some chains was moved
1551                                                                                                  * while */
1552                                         {                       /* cleaning this page */
1553                                                 Assert(vacpage->offsets_free > 0);
1554                                                 for (i = 0; i < vacpage->offsets_free; i++)
1555                                                 {
1556                                                         if (vacpage->offsets[i] == offnum)
1557                                                                 break;
1558                                                 }
1559                                                 if (i >= vacpage->offsets_free) /* not found */
1560                                                 {
1561                                                         vacpage->offsets[vacpage->offsets_free++] = offnum;
1562                                                         keep_tuples--;
1563                                                 }
1564                                         }
1565                                         else
1566                                         {
1567                                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1568                                                 keep_tuples--;
1569                                         }
1570                                         continue;
1571                                 }
1572                                 elog(ERROR, "HEAP_MOVED_OFF was expected");
1573                         }
1574
1575                         /*
1576                          * If this tuple is in a chain of tuples created by updates
1577                          * in "recent" transactions, then we have to move the whole
1578                          * chain of tuples to other places.
1579                          */
1580                         if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
1581                          !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
1582                                                 OldestXmin)) ||
1583                                 (!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1584                                  !(ItemPointerEquals(&(tuple.t_self),
1585                                                                          &(tuple.t_data->t_ctid)))))
1586                         {
1587                                 Buffer          Cbuf = buf;
1588                                 Page            Cpage;
1589                                 ItemId          Citemid;
1590                                 ItemPointerData Ctid;
1591                                 HeapTupleData tp = tuple;
1592                                 Size            tlen = tuple_len;
1593                                 VTupleMove      vtmove = (VTupleMove)
1594                                 palloc(100 * sizeof(VTupleMoveData));
1595                                 int                     num_vtmove = 0;
1596                                 int                     free_vtmove = 100;
1597                                 VacPage         to_vacpage = NULL;
1598                                 int                     to_item = 0;
1599                                 bool            freeCbuf = false;
1600                                 int                     ti;
1601
1602                                 if (vacrelstats->vtlinks == NULL)
1603                                         elog(ERROR, "No one parent tuple was found");
1604                                 if (cur_buffer != InvalidBuffer)
1605                                 {
1606                                         WriteBuffer(cur_buffer);
1607                                         cur_buffer = InvalidBuffer;
1608                                 }
1609
1610                                 /*
1611                                  * If this tuple is at the beginning or in the middle of the
1612                                  * chain, then we have to advance to the end of the chain.
1613                                  */
1614                                 while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1615                                            !(ItemPointerEquals(&(tp.t_self),
1616                                                                                    &(tp.t_data->t_ctid))))
1617                                 {
1618                                         Ctid = tp.t_data->t_ctid;
1619                                         if (freeCbuf)
1620                                                 ReleaseBuffer(Cbuf);
1621                                         freeCbuf = true;
1622                                         Cbuf = ReadBuffer(onerel,
1623                                                                           ItemPointerGetBlockNumber(&Ctid));
1624                                         Cpage = BufferGetPage(Cbuf);
1625                                         Citemid = PageGetItemId(Cpage,
1626                                                                           ItemPointerGetOffsetNumber(&Ctid));
1627                                         if (!ItemIdIsUsed(Citemid))
1628                                         {
1629                                                 /*
1630                                                  * This means that in the middle of the chain there
1631                                                  * was a tuple updated by an older (than OldestXmin)
1632                                                  * xaction, and this tuple was already deleted by
1633                                                  * me. Actually, the upper part of the chain should
1634                                                  * be removed, and it seems this should be handled
1635                                                  * in scan_heap(), but that's not implemented at the
1636                                                  * moment, so we just stop shrinking here.
1637                                                  */
1638                                                 ReleaseBuffer(Cbuf);
1639                                                 pfree(vtmove);
1640                                                 vtmove = NULL;
1641                                                 elog(WARNING, "Child itemid in update-chain marked as unused - can't continue repair_frag");
1642                                                 break;
1643                                         }
1644                                         tp.t_datamcxt = NULL;
1645                                         tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1646                                         tp.t_self = Ctid;
1647                                         tlen = tp.t_len = ItemIdGetLength(Citemid);
1648                                 }
1649                                 if (vtmove == NULL)
1650                                         break;
1651                                 /* first, can chain be moved ? */
1652                                 for (;;)
1653                                 {
1654                                         if (to_vacpage == NULL ||
1655                                                 !enough_space(to_vacpage, tlen))
1656                                         {
1657                                                 for (i = 0; i < num_fraged_pages; i++)
1658                                                 {
1659                                                         if (enough_space(fraged_pages->pagedesc[i], tlen))
1660                                                                 break;
1661                                                 }
1662
1663                                                 if (i == num_fraged_pages)
1664                                                 {
1665                                                         /* can't move item anywhere */
1666                                                         for (i = 0; i < num_vtmove; i++)
1667                                                         {
1668                                                                 Assert(vtmove[i].vacpage->offsets_used > 0);
1669                                                                 (vtmove[i].vacpage->offsets_used)--;
1670                                                         }
1671                                                         num_vtmove = 0;
1672                                                         break;
1673                                                 }
1674                                                 to_item = i;
1675                                                 to_vacpage = fraged_pages->pagedesc[to_item];
1676                                         }
1677                                         to_vacpage->free -= MAXALIGN(tlen);
1678                                         if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
1679                                                 to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
1680                                         (to_vacpage->offsets_used)++;
1681                                         if (free_vtmove == 0)
1682                                         {
1683                                                 free_vtmove = 1000;
1684                                                 vtmove = (VTupleMove) repalloc(vtmove,
1685                                                                                          (free_vtmove + num_vtmove) *
1686                                                                                                  sizeof(VTupleMoveData));
1687                                         }
1688                                         vtmove[num_vtmove].tid = tp.t_self;
1689                                         vtmove[num_vtmove].vacpage = to_vacpage;
1690                                         if (to_vacpage->offsets_used == 1)
1691                                                 vtmove[num_vtmove].cleanVpd = true;
1692                                         else
1693                                                 vtmove[num_vtmove].cleanVpd = false;
1694                                         free_vtmove--;
1695                                         num_vtmove++;
1696
1697                                         /* All done ? */
1698                                         if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
1699                                             TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
1700                                                                   OldestXmin))
1701                                                 break;
1702
1703                                         /* Well, try to find tuple with old row version */
1704                                         for (;;)
1705                                         {
1706                                                 Buffer          Pbuf;
1707                                                 Page            Ppage;
1708                                                 ItemId          Pitemid;
1709                                                 HeapTupleData Ptp;
1710                                                 VTupleLinkData vtld,
1711                                                                    *vtlp;
1712
1713                                                 vtld.new_tid = tp.t_self;
1714                                                 vtlp = (VTupleLink)
1715                                                         vac_bsearch((void *) &vtld,
1716                                                                                 (void *) (vacrelstats->vtlinks),
1717                                                                                 vacrelstats->num_vtlinks,
1718                                                                                 sizeof(VTupleLinkData),
1719                                                                                 vac_cmp_vtlinks);
1720                                                 if (vtlp == NULL)
1721                                                         elog(ERROR, "Parent tuple was not found");
1722                                                 tp.t_self = vtlp->this_tid;
1723                                                 Pbuf = ReadBuffer(onerel,
1724                                                                 ItemPointerGetBlockNumber(&(tp.t_self)));
1725                                                 Ppage = BufferGetPage(Pbuf);
1726                                                 Pitemid = PageGetItemId(Ppage,
1727                                                            ItemPointerGetOffsetNumber(&(tp.t_self)));
1728                                                 if (!ItemIdIsUsed(Pitemid))
1729                                                         elog(ERROR, "Parent itemid marked as unused");
1730                                                 Ptp.t_datamcxt = NULL;
1731                                                 Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
1732                                                 Assert(ItemPointerEquals(&(vtld.new_tid),
1733                                                                                                  &(Ptp.t_data->t_ctid)));
1734
1735                                                 /*
1736                                                  * Read above about cases when
1737                                                  * !ItemIdIsUsed(Citemid) (child item is
1738                                                  * removed)... Due to the fact that at the moment
1739                                                  * we don't remove unuseful part of update-chain,
1740                                                  * it's possible to get too old parent row here.
1741                                                  * Like as in the case which caused this problem,
1742                                                  * we stop shrinking here. I could try to find
1743                                                  * real parent row but want not to do it because
1744                                                  * of real solution will be implemented anyway,
1745                                                  * latter, and we are too close to 6.5 release. -
1746                                                  * vadim 06/11/99
1747                                                  */
1748                                                 if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
1749                                                                                                   HeapTupleHeaderGetXmin(tp.t_data))))
1750                                                 {
1751                                                         if (freeCbuf)
1752                                                                 ReleaseBuffer(Cbuf);
1753                                                         freeCbuf = false;
1754                                                         ReleaseBuffer(Pbuf);
1755                                                         for (i = 0; i < num_vtmove; i++)
1756                                                         {
1757                                                                 Assert(vtmove[i].vacpage->offsets_used > 0);
1758                                                                 (vtmove[i].vacpage->offsets_used)--;
1759                                                         }
1760                                                         num_vtmove = 0;
1761                                                         elog(WARNING, "Too old parent tuple found - can't continue repair_frag");
1762                                                         break;
1763                                                 }
1764 #ifdef NOT_USED                                 /* I'm not sure that this will wotk
1765                                                                  * properly... */
1766
1767                                                 /*
1768                                                  * If this tuple is updated version of row and it
1769                                                  * was created by the same transaction then no one
1770                                                  * is interested in this tuple - mark it as
1771                                                  * removed.
1772                                                  */
1773                                                 if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
1774                                                         TransactionIdEquals(HeapTupleHeaderGetXmin(Ptp.t_data),
1775                                                                                                 HeapTupleHeaderGetXmax(Ptp.t_data)))
1776                                                 {
1777                                                         Ptp.t_data->t_infomask &=
1778                                                                 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1779                                                         Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
1780                                                         HeapTupleHeaderSetXvac(Ptp.t_data, myXID);
1781                                                         WriteBuffer(Pbuf);
1782                                                         continue;
1783                                                 }
1784 #endif
1785                                                 tp.t_datamcxt = Ptp.t_datamcxt;
1786                                                 tp.t_data = Ptp.t_data;
1787                                                 tlen = tp.t_len = ItemIdGetLength(Pitemid);
1788                                                 if (freeCbuf)
1789                                                         ReleaseBuffer(Cbuf);
1790                                                 Cbuf = Pbuf;
1791                                                 freeCbuf = true;
1792                                                 break;
1793                                         }
1794                                         if (num_vtmove == 0)
1795                                                 break;
1796                                 }
1797                                 if (freeCbuf)
1798                                         ReleaseBuffer(Cbuf);
1799                                 if (num_vtmove == 0)    /* chain can't be moved */
1800                                 {
1801                                         pfree(vtmove);
1802                                         break;
1803                                 }
1804                                 ItemPointerSetInvalid(&Ctid);
1805                                 for (ti = 0; ti < num_vtmove; ti++)
1806                                 {
1807                                         VacPage         destvacpage = vtmove[ti].vacpage;
1808
1809                                         /* Get page to move from */
1810                                         tuple.t_self = vtmove[ti].tid;
1811                                         Cbuf = ReadBuffer(onerel,
1812                                                          ItemPointerGetBlockNumber(&(tuple.t_self)));
1813
1814                                         /* Get page to move to */
1815                                         cur_buffer = ReadBuffer(onerel, destvacpage->blkno);
1816
1817                                         LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1818                                         if (cur_buffer != Cbuf)
1819                                                 LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);
1820
1821                                         ToPage = BufferGetPage(cur_buffer);
1822                                         Cpage = BufferGetPage(Cbuf);
1823
1824                                         Citemid = PageGetItemId(Cpage,
1825                                                         ItemPointerGetOffsetNumber(&(tuple.t_self)));
1826                                         tuple.t_datamcxt = NULL;
1827                                         tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1828                                         tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1829
1830                                         /*
1831                                          * make a copy of the source tuple, and then mark the
1832                                          * source tuple MOVED_OFF.
1833                                          */
1834                                         heap_copytuple_with_tuple(&tuple, &newtup);
1835
1836                                         /*
1837                                          * register invalidation of source tuple in catcaches.
1838                                          */
1839                                         CacheInvalidateHeapTuple(onerel, &tuple);
1840
1841                                         /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
1842                                         START_CRIT_SECTION();
1843
1844                                         tuple.t_data->t_infomask &=
1845                                                 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1846                                         tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1847                                         HeapTupleHeaderSetXvac(tuple.t_data, myXID);
1848
1849                                         /*
1850                                          * If this page was not used before - clean it.
1851                                          *
1852                                          * NOTE: a nasty bug used to lurk here.  It is possible
1853                                          * for the source and destination pages to be the same
1854                                          * (since this tuple-chain member can be on a page
1855                                          * lower than the one we're currently processing in
1856                                          * the outer loop).  If that's true, then after
1857                                          * vacuum_page() the source tuple will have been
1858                                          * moved, and tuple.t_data will be pointing at
1859                                          * garbage.  Therefore we must do everything that uses
1860                                          * tuple.t_data BEFORE this step!!
1861                                          *
1862                                          * This path is different from the other callers of
1863                                          * vacuum_page, because we have already incremented
1864                                          * the vacpage's offsets_used field to account for the
1865                                          * tuple(s) we expect to move onto the page. Therefore
1866                                          * vacuum_page's check for offsets_used == 0 is wrong.
1867                                          * But since that's a good debugging check for all
1868                                          * other callers, we work around it here rather than
1869                                          * remove it.
1870                                          */
1871                                         if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1872                                         {
1873                                                 int                     sv_offsets_used = destvacpage->offsets_used;
1874
1875                                                 destvacpage->offsets_used = 0;
1876                                                 vacuum_page(onerel, cur_buffer, destvacpage);
1877                                                 destvacpage->offsets_used = sv_offsets_used;
1878                                         }
1879
1880                                         /*
1881                                          * Update the state of the copied tuple, and store it
1882                                          * on the destination page.
1883                                          */
1884                                         newtup.t_data->t_infomask &=
1885                                                 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1886                                         newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1887                                         HeapTupleHeaderSetXvac(newtup.t_data, myXID);
1888                                         newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1889                                                                                  InvalidOffsetNumber, LP_USED);
1890                                         if (newoff == InvalidOffsetNumber)
1891                                         {
1892                                                 elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
1893                                                   (unsigned long) tuple_len, destvacpage->blkno);
1894                                         }
1895                                         newitemid = PageGetItemId(ToPage, newoff);
1896                                         pfree(newtup.t_data);
1897                                         newtup.t_datamcxt = NULL;
1898                                         newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1899                                         ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
1900
1901                                         {
1902                                                 XLogRecPtr      recptr =
1903                                                 log_heap_move(onerel, Cbuf, tuple.t_self,
1904                                                                           cur_buffer, &newtup);
1905
1906                                                 if (Cbuf != cur_buffer)
1907                                                 {
1908                                                         PageSetLSN(Cpage, recptr);
1909                                                         PageSetSUI(Cpage, ThisStartUpID);
1910                                                 }
1911                                                 PageSetLSN(ToPage, recptr);
1912                                                 PageSetSUI(ToPage, ThisStartUpID);
1913                                         }
1914                                         END_CRIT_SECTION();
1915
1916                                         if (destvacpage->blkno > last_move_dest_block)
1917                                                 last_move_dest_block = destvacpage->blkno;
1918
1919                                         /*
1920                                          * Set new tuple's t_ctid pointing to itself for last
1921                                          * tuple in chain, and to next tuple in chain
1922                                          * otherwise.
1923                                          */
1924                                         if (!ItemPointerIsValid(&Ctid))
1925                                                 newtup.t_data->t_ctid = newtup.t_self;
1926                                         else
1927                                                 newtup.t_data->t_ctid = Ctid;
1928                                         Ctid = newtup.t_self;
1929
1930                                         num_moved++;
1931
1932                                         /*
1933                                          * Remember that we moved tuple from the current page
1934                                          * (corresponding index tuple will be cleaned).
1935                                          */
1936                                         if (Cbuf == buf)
1937                                                 vacpage->offsets[vacpage->offsets_free++] =
1938                                                         ItemPointerGetOffsetNumber(&(tuple.t_self));
1939                                         else
1940                                                 keep_tuples++;
1941
1942                                         LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
1943                                         if (cur_buffer != Cbuf)
1944                                                 LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);
1945
1946                                         /* Create index entries for the moved tuple */
1947                                         if (resultRelInfo->ri_NumIndices > 0)
1948                                         {
1949                                                 ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
1950                                                 ExecInsertIndexTuples(slot, &(newtup.t_self),
1951                                                                                           estate, true);
1952                                         }
1953
1954                                         WriteBuffer(cur_buffer);
1955                                         WriteBuffer(Cbuf);
1956                                 }
1957                                 cur_buffer = InvalidBuffer;
1958                                 pfree(vtmove);
1959                                 chain_tuple_moved = true;
1960                                 continue;
1961                         }
1962
1963                         /* try to find new page for this tuple */
1964                         if (cur_buffer == InvalidBuffer ||
1965                                 !enough_space(cur_page, tuple_len))
1966                         {
1967                                 if (cur_buffer != InvalidBuffer)
1968                                 {
1969                                         WriteBuffer(cur_buffer);
1970                                         cur_buffer = InvalidBuffer;
1971                                 }
1972                                 for (i = 0; i < num_fraged_pages; i++)
1973                                 {
1974                                         if (enough_space(fraged_pages->pagedesc[i], tuple_len))
1975                                                 break;
1976                                 }
1977                                 if (i == num_fraged_pages)
1978                                         break;          /* can't move item anywhere */
1979                                 cur_item = i;
1980                                 cur_page = fraged_pages->pagedesc[cur_item];
1981                                 cur_buffer = ReadBuffer(onerel, cur_page->blkno);
1982                                 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1983                                 ToPage = BufferGetPage(cur_buffer);
1984                                 /* if this page was not used before - clean it */
1985                                 if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
1986                                         vacuum_page(onerel, cur_buffer, cur_page);
1987                         }
1988                         else
1989                                 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1990
1991                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1992
1993                         /* copy tuple */
1994                         heap_copytuple_with_tuple(&tuple, &newtup);
1995
1996                         /*
1997                          * register invalidation of source tuple in catcaches.
1998                          *
1999                          * (Note: we do not need to register the copied tuple,
2000                          * because we are not changing the tuple contents and
2001                          * so there cannot be any need to flush negative
2002                          * catcache entries.)
2003                          */
2004                         CacheInvalidateHeapTuple(onerel, &tuple);
2005
2006                         /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
2007                         START_CRIT_SECTION();
2008
2009                         /*
2010                          * Mark new tuple as moved_in by vacuum and store vacuum XID
2011                          * in t_cmin !!!
2012                          */
2013                         newtup.t_data->t_infomask &=
2014                                 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
2015                         newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2016                         HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2017
2018                         /* add tuple to the page */
2019                         newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
2020                                                                  InvalidOffsetNumber, LP_USED);
2021                         if (newoff == InvalidOffsetNumber)
2022                         {
2023                                 elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
2024                                          (unsigned long) tuple_len,
2025                                          cur_page->blkno, (unsigned long) cur_page->free,
2026                                          cur_page->offsets_used, cur_page->offsets_free);
2027                         }
2028                         newitemid = PageGetItemId(ToPage, newoff);
2029                         pfree(newtup.t_data);
2030                         newtup.t_datamcxt = NULL;
2031                         newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
2032                         ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
2033                         newtup.t_self = newtup.t_data->t_ctid;
2034
2035                         /*
2036                          * Mark old tuple as moved_off by vacuum and store vacuum XID
2037                          * in t_cmin !!!
2038                          */
2039                         tuple.t_data->t_infomask &=
2040                                 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
2041                         tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
2042                         HeapTupleHeaderSetXvac(tuple.t_data, myXID);
2043
2044                         {
2045                                 XLogRecPtr      recptr =
2046                                 log_heap_move(onerel, buf, tuple.t_self,
2047                                                           cur_buffer, &newtup);
2048
2049                                 PageSetLSN(page, recptr);
2050                                 PageSetSUI(page, ThisStartUpID);
2051                                 PageSetLSN(ToPage, recptr);
2052                                 PageSetSUI(ToPage, ThisStartUpID);
2053                         }
2054                         END_CRIT_SECTION();
2055
2056                         cur_page->offsets_used++;
2057                         num_moved++;
2058                         cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
2059                         if (cur_page->blkno > last_move_dest_block)
2060                                 last_move_dest_block = cur_page->blkno;
2061
2062                         vacpage->offsets[vacpage->offsets_free++] = offnum;
2063
2064                         LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
2065                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2066
2067                         /* insert index' tuples if needed */
2068                         if (resultRelInfo->ri_NumIndices > 0)
2069                         {
2070                                 ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
2071                                 ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
2072                         }
2073                 }                                               /* walk along page */
2074
2075                 if (offnum < maxoff && keep_tuples > 0)
2076                 {
2077                         OffsetNumber off;
2078
2079                         for (off = OffsetNumberNext(offnum);
2080                                  off <= maxoff;
2081                                  off = OffsetNumberNext(off))
2082                         {
2083                                 itemid = PageGetItemId(page, off);
2084                                 if (!ItemIdIsUsed(itemid))
2085                                         continue;
2086                                 tuple.t_datamcxt = NULL;
2087                                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2088                                 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
2089                                         continue;
2090                                 if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2091                                         elog(ERROR, "Invalid XVAC in tuple header (4)");
2092                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
2093                                         elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
2094                                 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
2095                                 {
2096                                         /* some chains was moved while */
2097                                         if (chain_tuple_moved)
2098                                         {                       /* cleaning this page */
2099                                                 Assert(vacpage->offsets_free > 0);
2100                                                 for (i = 0; i < vacpage->offsets_free; i++)
2101                                                 {
2102                                                         if (vacpage->offsets[i] == off)
2103                                                                 break;
2104                                                 }
2105                                                 if (i >= vacpage->offsets_free) /* not found */
2106                                                 {
2107                                                         vacpage->offsets[vacpage->offsets_free++] = off;
2108                                                         Assert(keep_tuples > 0);
2109                                                         keep_tuples--;
2110                                                 }
2111                                         }
2112                                         else
2113                                         {
2114                                                 vacpage->offsets[vacpage->offsets_free++] = off;
2115                                                 Assert(keep_tuples > 0);
2116                                                 keep_tuples--;
2117                                         }
2118                                 }
2119                         }
2120                 }
2121
2122                 if (vacpage->offsets_free > 0)  /* some tuples were moved */
2123                 {
2124                         if (chain_tuple_moved)          /* else - they are ordered */
2125                         {
2126                                 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
2127                                           sizeof(OffsetNumber), vac_cmp_offno);
2128                         }
2129                         vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
2130                         WriteBuffer(buf);
2131                 }
2132                 else if (dowrite)
2133                         WriteBuffer(buf);
2134                 else
2135                         ReleaseBuffer(buf);
2136
2137                 if (offnum <= maxoff)
2138                         break;                          /* some item(s) left */
2139
2140         }                                                       /* walk along relation */
2141
2142         blkno++;                                        /* new number of blocks */
2143
2144         if (cur_buffer != InvalidBuffer)
2145         {
2146                 Assert(num_moved > 0);
2147                 WriteBuffer(cur_buffer);
2148         }
2149
2150         if (num_moved > 0)
2151         {
2152                 /*
2153                  * We have to commit our tuple movings before we truncate the
2154                  * relation.  Ideally we should do Commit/StartTransactionCommand
2155                  * here, relying on the session-level table lock to protect our
2156                  * exclusive access to the relation.  However, that would require
2157                  * a lot of extra code to close and re-open the relation, indexes,
2158                  * etc.  For now, a quick hack: record status of current
2159                  * transaction as committed, and continue.
2160                  */
2161                 RecordTransactionCommit();
2162         }
2163
2164         /*
2165          * We are not going to move any more tuples across pages, but we still
2166          * need to apply vacuum_page to compact free space in the remaining
2167          * pages in vacuum_pages list.  Note that some of these pages may also
2168          * be in the fraged_pages list, and may have had tuples moved onto
2169          * them; if so, we already did vacuum_page and needn't do it again.
2170          */
2171         for (i = 0, curpage = vacuum_pages->pagedesc;
2172                  i < vacuumed_pages;
2173                  i++, curpage++)
2174         {
2175                 CHECK_FOR_INTERRUPTS();
2176                 Assert((*curpage)->blkno < blkno);
2177                 if ((*curpage)->offsets_used == 0)
2178                 {
2179                         /* this page was not used as a move target, so must clean it */
2180                         buf = ReadBuffer(onerel, (*curpage)->blkno);
2181                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2182                         page = BufferGetPage(buf);
2183                         if (!PageIsEmpty(page))
2184                                 vacuum_page(onerel, buf, *curpage);
2185                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2186                         WriteBuffer(buf);
2187                 }
2188         }
2189
2190         /*
2191          * Now scan all the pages that we moved tuples onto and update tuple
2192          * status bits.  This is not really necessary, but will save time for
2193          * future transactions examining these tuples.
2194          *
2195          * XXX WARNING that this code fails to clear HEAP_MOVED_OFF tuples from
2196          * pages that were move source pages but not move dest pages.  One
2197          * also wonders whether it wouldn't be better to skip this step and
2198          * let the tuple status updates happen someplace that's not holding an
2199          * exclusive lock on the relation.
2200          */
2201         checked_moved = 0;
2202         for (i = 0, curpage = fraged_pages->pagedesc;
2203                  i < num_fraged_pages;
2204                  i++, curpage++)
2205         {
2206                 CHECK_FOR_INTERRUPTS();
2207                 Assert((*curpage)->blkno < blkno);
2208                 if ((*curpage)->blkno > last_move_dest_block)
2209                         break;                          /* no need to scan any further */
2210                 if ((*curpage)->offsets_used == 0)
2211                         continue;                       /* this page was never used as a move dest */
2212                 buf = ReadBuffer(onerel, (*curpage)->blkno);
2213                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2214                 page = BufferGetPage(buf);
2215                 num_tuples = 0;
2216                 max_offset = PageGetMaxOffsetNumber(page);
2217                 for (newoff = FirstOffsetNumber;
2218                          newoff <= max_offset;
2219                          newoff = OffsetNumberNext(newoff))
2220                 {
2221                         itemid = PageGetItemId(page, newoff);
2222                         if (!ItemIdIsUsed(itemid))
2223                                 continue;
2224                         tuple.t_datamcxt = NULL;
2225                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2226                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
2227                         {
2228                                 if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2229                                         elog(ERROR, "Invalid XVAC in tuple header (2)");
2230                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
2231                                 {
2232                                         tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
2233                                         num_tuples++;
2234                                 }
2235                                 else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
2236                                         tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
2237                                 else
2238                                         elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
2239                         }
2240                 }
2241                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2242                 WriteBuffer(buf);
2243                 Assert((*curpage)->offsets_used == num_tuples);
2244                 checked_moved += num_tuples;
2245         }
2246         Assert(num_moved == checked_moved);
2247
2248         elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
2249                  RelationGetRelationName(onerel),
2250                  nblocks, blkno, num_moved,
2251                  vac_show_rusage(&ru0));
2252
2253         /*
2254          * Reflect the motion of system tuples to catalog cache here.
2255          */
2256         CommandCounterIncrement();
2257
2258         if (Nvacpagelist.num_pages > 0)
2259         {
2260                 /* vacuum indexes again if needed */
2261                 if (Irel != (Relation *) NULL)
2262                 {
2263                         VacPage    *vpleft,
2264                                            *vpright,
2265                                                 vpsave;
2266
2267                         /* re-sort Nvacpagelist.pagedesc */
2268                         for (vpleft = Nvacpagelist.pagedesc,
2269                         vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2270                                  vpleft < vpright; vpleft++, vpright--)
2271                         {
2272                                 vpsave = *vpleft;
2273                                 *vpleft = *vpright;
2274                                 *vpright = vpsave;
2275                         }
2276                         Assert(keep_tuples >= 0);
2277                         for (i = 0; i < nindexes; i++)
2278                                 vacuum_index(&Nvacpagelist, Irel[i],
2279                                                          vacrelstats->rel_tuples, keep_tuples);
2280                 }
2281
2282                 /* clean moved tuples from last page in Nvacpagelist list */
2283                 if (vacpage->blkno == (blkno - 1) &&
2284                         vacpage->offsets_free > 0)
2285                 {
2286                         OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
2287                         OffsetNumber *unused = unbuf;
2288                         int                     uncnt;
2289
2290                         buf = ReadBuffer(onerel, vacpage->blkno);
2291                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2292                         page = BufferGetPage(buf);
2293                         num_tuples = 0;
2294                         maxoff = PageGetMaxOffsetNumber(page);
2295                         for (offnum = FirstOffsetNumber;
2296                                  offnum <= maxoff;
2297                                  offnum = OffsetNumberNext(offnum))
2298                         {
2299                                 itemid = PageGetItemId(page, offnum);
2300                                 if (!ItemIdIsUsed(itemid))
2301                                         continue;
2302                                 tuple.t_datamcxt = NULL;
2303                                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2304
2305                                 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
2306                                 {
2307                                         if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2308                                                 elog(ERROR, "Invalid XVAC in tuple header (3)");
2309                                         if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
2310                                         {
2311                                                 itemid->lp_flags &= ~LP_USED;
2312                                                 num_tuples++;
2313                                         }
2314                                         else
2315                                                 elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
2316                                 }
2317
2318                         }
2319                         Assert(vacpage->offsets_free == num_tuples);
2320                         START_CRIT_SECTION();
2321                         uncnt = PageRepairFragmentation(page, unused);
2322                         {
2323                                 XLogRecPtr      recptr;
2324
2325                                 recptr = log_heap_clean(onerel, buf, (char *) unused,
2326                                                   (char *) (&(unused[uncnt])) - (char *) unused);
2327                                 PageSetLSN(page, recptr);
2328                                 PageSetSUI(page, ThisStartUpID);
2329                         }
2330                         END_CRIT_SECTION();
2331                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2332                         WriteBuffer(buf);
2333                 }
2334
2335                 /* now - free new list of reaped pages */
2336                 curpage = Nvacpagelist.pagedesc;
2337                 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
2338                         pfree(*curpage);
2339                 pfree(Nvacpagelist.pagedesc);
2340         }
2341
2342         /*
2343          * Flush dirty pages out to disk.  We do this unconditionally, even if
2344          * we don't need to truncate, because we want to ensure that all
2345          * tuples have correct on-row commit status on disk (see bufmgr.c's
2346          * comments for FlushRelationBuffers()).
2347          */
2348         i = FlushRelationBuffers(onerel, blkno);
2349         if (i < 0)
2350                 elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
2351                          i);
2352
2353         /* truncate relation, if needed */
2354         if (blkno < nblocks)
2355         {
2356                 blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
2357                 onerel->rd_nblocks = blkno;             /* update relcache immediately */
2358                 onerel->rd_targblock = InvalidBlockNumber;
2359                 vacrelstats->rel_pages = blkno; /* set new number of blocks */
2360         }
2361
2362         /* clean up */
2363         pfree(vacpage);
2364         if (vacrelstats->vtlinks != NULL)
2365                 pfree(vacrelstats->vtlinks);
2366
2367         ExecDropTupleTable(tupleTable, true);
2368
2369         ExecCloseIndices(resultRelInfo);
2370 }
2371
2372 /*
2373  *      vacuum_heap() -- free dead tuples
2374  *
2375  *              This routine marks dead tuples as unused and truncates relation
2376  *              if there are "empty" end-blocks.
2377  */
2378 static void
2379 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
2380 {
2381         Buffer          buf;
2382         VacPage    *vacpage;
2383         BlockNumber relblocks;
2384         int                     nblocks;
2385         int                     i;
2386
2387         nblocks = vacuum_pages->num_pages;
2388         nblocks -= vacuum_pages->empty_end_pages;       /* nothing to do with them */
2389
2390         for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
2391         {
2392                 CHECK_FOR_INTERRUPTS();
2393                 if ((*vacpage)->offsets_free > 0)
2394                 {
2395                         buf = ReadBuffer(onerel, (*vacpage)->blkno);
2396                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2397                         vacuum_page(onerel, buf, *vacpage);
2398                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2399                         WriteBuffer(buf);
2400                 }
2401         }
2402
2403         /*
2404          * Flush dirty pages out to disk.  We do this unconditionally, even if
2405          * we don't need to truncate, because we want to ensure that all
2406          * tuples have correct on-row commit status on disk (see bufmgr.c's
2407          * comments for FlushRelationBuffers()).
2408          */
2409         Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
2410         relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
2411
2412         i = FlushRelationBuffers(onerel, relblocks);
2413         if (i < 0)
2414                 elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
2415                          i);
2416
2417         /* truncate relation if there are some empty end-pages */
2418         if (vacuum_pages->empty_end_pages > 0)
2419         {
2420                 elog(elevel, "Rel %s: Pages: %u --> %u.",
2421                          RelationGetRelationName(onerel),
2422                          vacrelstats->rel_pages, relblocks);
2423                 relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
2424                 onerel->rd_nblocks = relblocks; /* update relcache immediately */
2425                 onerel->rd_targblock = InvalidBlockNumber;
2426                 vacrelstats->rel_pages = relblocks;             /* set new number of
2427                                                                                                  * blocks */
2428         }
2429 }
2430
2431 /*
2432  *      vacuum_page() -- free dead tuples on a page
2433  *                                       and repair its fragmentation.
2434  */
2435 static void
2436 vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
2437 {
2438         OffsetNumber unbuf[BLCKSZ / sizeof(OffsetNumber)];
2439         OffsetNumber *unused = unbuf;
2440         int                     uncnt;
2441         Page            page = BufferGetPage(buffer);
2442         ItemId          itemid;
2443         int                     i;
2444
2445         /* There shouldn't be any tuples moved onto the page yet! */
2446         Assert(vacpage->offsets_used == 0);
2447
2448         START_CRIT_SECTION();
2449         for (i = 0; i < vacpage->offsets_free; i++)
2450         {
2451                 itemid = PageGetItemId(page, vacpage->offsets[i]);
2452                 itemid->lp_flags &= ~LP_USED;
2453         }
2454         uncnt = PageRepairFragmentation(page, unused);
2455         {
2456                 XLogRecPtr      recptr;
2457
2458                 recptr = log_heap_clean(onerel, buffer, (char *) unused,
2459                                                   (char *) (&(unused[uncnt])) - (char *) unused);
2460                 PageSetLSN(page, recptr);
2461                 PageSetSUI(page, ThisStartUpID);
2462         }
2463         END_CRIT_SECTION();
2464 }
2465
2466 /*
2467  *      scan_index() -- scan one index relation to update statistic.
2468  *
2469  * We use this when we have no deletions to do.
2470  */
2471 static void
2472 scan_index(Relation indrel, double num_tuples)
2473 {
2474         IndexBulkDeleteResult *stats;
2475         VacRUsage       ru0;
2476
2477         vac_init_rusage(&ru0);
2478
2479         /*
2480          * Even though we're not planning to delete anything, use the
2481          * ambulkdelete call, so that the scan happens within the index AM for
2482          * more speed.
2483          */
2484         stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
2485
2486         if (!stats)
2487                 return;
2488
2489         /* now update statistics in pg_class */
2490         vac_update_relstats(RelationGetRelid(indrel),
2491                                                 stats->num_pages, stats->num_index_tuples,
2492                                                 false);
2493
2494         elog(elevel, "Index %s: Pages %u; Tuples %.0f.\n\t%s",
2495                  RelationGetRelationName(indrel),
2496                  stats->num_pages, stats->num_index_tuples,
2497                  vac_show_rusage(&ru0));
2498
2499         /*
2500          * Check for tuple count mismatch.      If the index is partial, then it's
2501          * OK for it to have fewer tuples than the heap; else we got trouble.
2502          */
2503         if (stats->num_index_tuples != num_tuples)
2504         {
2505                 if (stats->num_index_tuples > num_tuples ||
2506                         !vac_is_partial_index(indrel))
2507                         elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
2508 \n\tRecreate the index.",
2509                                  RelationGetRelationName(indrel),
2510                                  stats->num_index_tuples, num_tuples);
2511         }
2512
2513         pfree(stats);
2514 }
2515
2516 /*
2517  *      vacuum_index() -- vacuum one index relation.
2518  *
2519  *              Vpl is the VacPageList of the heap we're currently vacuuming.
2520  *              It's locked. Indrel is an index relation on the vacuumed heap.
2521  *
2522  *              We don't bother to set locks on the index relation here, since
2523  *              the parent table is exclusive-locked already.
2524  *
2525  *              Finally, we arrange to update the index relation's statistics in
2526  *              pg_class.
2527  */
2528 static void
2529 vacuum_index(VacPageList vacpagelist, Relation indrel,
2530                          double num_tuples, int keep_tuples)
2531 {
2532         IndexBulkDeleteResult *stats;
2533         VacRUsage       ru0;
2534
2535         vac_init_rusage(&ru0);
2536
2537         /* Do bulk deletion */
2538         stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);
2539
2540         if (!stats)
2541                 return;
2542
2543         /* now update statistics in pg_class */
2544         vac_update_relstats(RelationGetRelid(indrel),
2545                                                 stats->num_pages, stats->num_index_tuples,
2546                                                 false);
2547
2548         elog(elevel, "Index %s: Pages %u; Tuples %.0f: Deleted %.0f.\n\t%s",
2549                  RelationGetRelationName(indrel), stats->num_pages,
2550                  stats->num_index_tuples - keep_tuples, stats->tuples_removed,
2551                  vac_show_rusage(&ru0));
2552
2553         /*
2554          * Check for tuple count mismatch.      If the index is partial, then it's
2555          * OK for it to have fewer tuples than the heap; else we got trouble.
2556          */
2557         if (stats->num_index_tuples != num_tuples + keep_tuples)
2558         {
2559                 if (stats->num_index_tuples > num_tuples + keep_tuples ||
2560                         !vac_is_partial_index(indrel))
2561                         elog(WARNING, "Index %s: NUMBER OF INDEX' TUPLES (%.0f) IS NOT THE SAME AS HEAP' (%.0f).\
2562 \n\tRecreate the index.",
2563                                  RelationGetRelationName(indrel),
2564                                  stats->num_index_tuples, num_tuples);
2565         }
2566
2567         pfree(stats);
2568 }
2569
2570 /*
2571  *      tid_reaped() -- is a particular tid reaped?
2572  *
2573  *              This has the right signature to be an IndexBulkDeleteCallback.
2574  *
2575  *              vacpagelist->VacPage_array is sorted in right order.
2576  */
2577 static bool
2578 tid_reaped(ItemPointer itemptr, void *state)
2579 {
2580         VacPageList vacpagelist = (VacPageList) state;
2581         OffsetNumber ioffno;
2582         OffsetNumber *voff;
2583         VacPage         vp,
2584                            *vpp;
2585         VacPageData vacpage;
2586
2587         vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
2588         ioffno = ItemPointerGetOffsetNumber(itemptr);
2589
2590         vp = &vacpage;
2591         vpp = (VacPage *) vac_bsearch((void *) &vp,
2592                                                                   (void *) (vacpagelist->pagedesc),
2593                                                                   vacpagelist->num_pages,
2594                                                                   sizeof(VacPage),
2595                                                                   vac_cmp_blk);
2596
2597         if (vpp == NULL)
2598                 return false;
2599
2600         /* ok - we are on a partially or fully reaped page */
2601         vp = *vpp;
2602
2603         if (vp->offsets_free == 0)
2604         {
2605                 /* this is EmptyPage, so claim all tuples on it are reaped!!! */
2606                 return true;
2607         }
2608
2609         voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
2610                                                                                 (void *) (vp->offsets),
2611                                                                                 vp->offsets_free,
2612                                                                                 sizeof(OffsetNumber),
2613                                                                                 vac_cmp_offno);
2614
2615         if (voff == NULL)
2616                 return false;
2617
2618         /* tid is reaped */
2619         return true;
2620 }
2621
2622 /*
2623  * Dummy version for scan_index.
2624  */
2625 static bool
2626 dummy_tid_reaped(ItemPointer itemptr, void *state)
2627 {
2628         return false;
2629 }
2630
2631 /*
2632  * Update the shared Free Space Map with the info we now have about
2633  * free space in the relation, discarding any old info the map may have.
2634  */
2635 static void
2636 vac_update_fsm(Relation onerel, VacPageList fraged_pages,
2637                            BlockNumber rel_pages)
2638 {
2639         int                     nPages = fraged_pages->num_pages;
2640         int                     i;
2641         BlockNumber *pages;
2642         Size       *spaceAvail;
2643
2644         /* +1 to avoid palloc(0) */
2645         pages = (BlockNumber *) palloc((nPages + 1) * sizeof(BlockNumber));
2646         spaceAvail = (Size *) palloc((nPages + 1) * sizeof(Size));
2647
2648         for (i = 0; i < nPages; i++)
2649         {
2650                 pages[i] = fraged_pages->pagedesc[i]->blkno;
2651                 spaceAvail[i] = fraged_pages->pagedesc[i]->free;
2652
2653                 /*
2654                  * fraged_pages may contain entries for pages that we later
2655                  * decided to truncate from the relation; don't enter them into
2656                  * the map!
2657                  */
2658                 if (pages[i] >= rel_pages)
2659                 {
2660                         nPages = i;
2661                         break;
2662                 }
2663         }
2664
2665         MultiRecordFreeSpace(&onerel->rd_node,
2666                                                  0, MaxBlockNumber,
2667                                                  nPages, pages, spaceAvail);
2668         pfree(pages);
2669         pfree(spaceAvail);
2670 }
2671
2672 /* Copy a VacPage structure */
2673 static VacPage
2674 copy_vac_page(VacPage vacpage)
2675 {
2676         VacPage         newvacpage;
2677
2678         /* allocate a VacPageData entry */
2679         newvacpage = (VacPage) palloc(sizeof(VacPageData) +
2680                                                    vacpage->offsets_free * sizeof(OffsetNumber));
2681
2682         /* fill it in */
2683         if (vacpage->offsets_free > 0)
2684                 memcpy(newvacpage->offsets, vacpage->offsets,
2685                            vacpage->offsets_free * sizeof(OffsetNumber));
2686         newvacpage->blkno = vacpage->blkno;
2687         newvacpage->free = vacpage->free;
2688         newvacpage->offsets_used = vacpage->offsets_used;
2689         newvacpage->offsets_free = vacpage->offsets_free;
2690
2691         return newvacpage;
2692 }
2693
2694 /*
2695  * Add a VacPage pointer to a VacPageList.
2696  *
2697  *              As a side effect of the way that scan_heap works,
2698  *              higher pages come after lower pages in the array
2699  *              (and highest tid on a page is last).
2700  */
2701 static void
2702 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
2703 {
2704 #define PG_NPAGEDESC 1024
2705
2706         /* allocate a VacPage entry if needed */
2707         if (vacpagelist->num_pages == 0)
2708         {
2709                 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
2710                 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
2711         }
2712         else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
2713         {
2714                 vacpagelist->num_allocated_pages *= 2;
2715                 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
2716         }
2717         vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
2718         (vacpagelist->num_pages)++;
2719 }
2720
/*
 * vac_bsearch: a front end to the C library's bsearch() that first
 * compares the key against the first and last table entries.
 *
 * A key lying outside the table's range costs plain bsearch() as much as
 * any other key, yet that case comes up often enough in VACUUM to be
 * worth short-circuiting.  Arguments and result are those of bsearch():
 * a pointer to the matching element, or NULL when there is none.
 */
static void *
vac_bsearch(const void *key, const void *base,
                        size_t nelem, size_t size,
                        int (*compar) (const void *, const void *))
{
        const char *first = (const char *) base;
        const char *final;
        int                     cmp;

        if (nelem == 0)
                return NULL;

        /* key at or below the first entry? */
        cmp = compar(key, first);
        if (cmp <= 0)
                return (cmp == 0) ? (void *) first : NULL;

        if (nelem == 1)
                return NULL;                    /* the only entry was just ruled out */

        /* key at or above the last entry? */
        final = first + (nelem - 1) * size;
        cmp = compar(key, final);
        if (cmp >= 0)
                return (cmp == 0) ? (void *) final : NULL;

        if (nelem == 2)
                return NULL;                    /* both entries were just ruled out */

        return bsearch(key, base, nelem, size, compar);
}
2756
2757 /*
2758  * Comparator routines for use with qsort() and bsearch().
2759  */
2760 static int
2761 vac_cmp_blk(const void *left, const void *right)
2762 {
2763         BlockNumber lblk,
2764                                 rblk;
2765
2766         lblk = (*((VacPage *) left))->blkno;
2767         rblk = (*((VacPage *) right))->blkno;
2768
2769         if (lblk < rblk)
2770                 return -1;
2771         if (lblk == rblk)
2772                 return 0;
2773         return 1;
2774 }
2775
2776 static int
2777 vac_cmp_offno(const void *left, const void *right)
2778 {
2779         if (*(OffsetNumber *) left < *(OffsetNumber *) right)
2780                 return -1;
2781         if (*(OffsetNumber *) left == *(OffsetNumber *) right)
2782                 return 0;
2783         return 1;
2784 }
2785
2786 static int
2787 vac_cmp_vtlinks(const void *left, const void *right)
2788 {
2789         if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
2790                 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2791                 return -1;
2792         if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
2793                 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2794                 return 1;
2795         /* bi_hi-es are equal */
2796         if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
2797                 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2798                 return -1;
2799         if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
2800                 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2801                 return 1;
2802         /* bi_lo-es are equal */
2803         if (((VTupleLink) left)->new_tid.ip_posid <
2804                 ((VTupleLink) right)->new_tid.ip_posid)
2805                 return -1;
2806         if (((VTupleLink) left)->new_tid.ip_posid >
2807                 ((VTupleLink) right)->new_tid.ip_posid)
2808                 return 1;
2809         return 0;
2810 }
2811
2812
2813 void
2814 vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
2815 {
2816         List       *indexoidlist,
2817                            *indexoidscan;
2818         int                     i;
2819
2820         indexoidlist = RelationGetIndexList(relation);
2821
2822         *nindexes = length(indexoidlist);
2823
2824         if (*nindexes > 0)
2825                 *Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
2826         else
2827                 *Irel = NULL;
2828
2829         i = 0;
2830         foreach(indexoidscan, indexoidlist)
2831         {
2832                 Oid                     indexoid = lfirsti(indexoidscan);
2833
2834                 (*Irel)[i] = index_open(indexoid);
2835                 i++;
2836         }
2837
2838         freeList(indexoidlist);
2839 }
2840
2841
2842 void
2843 vac_close_indexes(int nindexes, Relation *Irel)
2844 {
2845         if (Irel == (Relation *) NULL)
2846                 return;
2847
2848         while (nindexes--)
2849                 index_close(Irel[nindexes]);
2850         pfree(Irel);
2851 }
2852
2853
2854 /*
2855  * Is an index partial (ie, could it contain fewer tuples than the heap?)
2856  */
2857 bool
2858 vac_is_partial_index(Relation indrel)
2859 {
2860         /*
2861          * If the index's AM doesn't support nulls, it's partial for our
2862          * purposes
2863          */
2864         if (!indrel->rd_am->amindexnulls)
2865                 return true;
2866
2867         /* Otherwise, look to see if there's a partial-index predicate */
2868         return (VARSIZE(&indrel->rd_index->indpred) > VARHDRSZ);
2869 }
2870
2871
2872 static bool
2873 enough_space(VacPage vacpage, Size len)
2874 {
2875         len = MAXALIGN(len);
2876
2877         if (len > vacpage->free)
2878                 return false;
2879
2880         /* if there are free itemid(s) and len <= free_space... */
2881         if (vacpage->offsets_used < vacpage->offsets_free)
2882                 return true;
2883
2884         /* noff_used >= noff_free and so we'll have to allocate new itemid */
2885         if (len + sizeof(ItemIdData) <= vacpage->free)
2886                 return true;
2887
2888         return false;
2889 }
2890
2891
2892 /*
2893  * Initialize usage snapshot.
2894  */
2895 void
2896 vac_init_rusage(VacRUsage *ru0)
2897 {
2898         struct timezone tz;
2899
2900         getrusage(RUSAGE_SELF, &ru0->ru);
2901         gettimeofday(&ru0->tv, &tz);
2902 }
2903
2904 /*
2905  * Compute elapsed time since ru0 usage snapshot, and format into
2906  * a displayable string.  Result is in a static string, which is
2907  * tacky, but no one ever claimed that the Postgres backend is
2908  * threadable...
2909  */
2910 const char *
2911 vac_show_rusage(VacRUsage *ru0)
2912 {
2913         static char result[100];
2914         VacRUsage       ru1;
2915
2916         vac_init_rusage(&ru1);
2917
2918         if (ru1.tv.tv_usec < ru0->tv.tv_usec)
2919         {
2920                 ru1.tv.tv_sec--;
2921                 ru1.tv.tv_usec += 1000000;
2922         }
2923         if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
2924         {
2925                 ru1.ru.ru_stime.tv_sec--;
2926                 ru1.ru.ru_stime.tv_usec += 1000000;
2927         }
2928         if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
2929         {
2930                 ru1.ru.ru_utime.tv_sec--;
2931                 ru1.ru.ru_utime.tv_usec += 1000000;
2932         }
2933
2934         snprintf(result, sizeof(result),
2935                          "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
2936                          (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
2937           (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
2938                          (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
2939           (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
2940                          (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
2941                          (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);
2942
2943         return result;
2944 }