1 /*-------------------------------------------------------------------------
2  *
3  * vacuum.c
4  *        The postgres vacuum cleaner.
5  *
6  * This file includes the "full" version of VACUUM, as well as control code
7  * used by all three of full VACUUM, lazy VACUUM, and ANALYZE.  See
8  * vacuumlazy.c and analyze.c for the rest of the code for the latter two.
9  *
10  *
11  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
12  * Portions Copyright (c) 1994, Regents of the University of California
13  *
14  *
15  * IDENTIFICATION
16  *        $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.256 2003/06/27 14:45:27 petere Exp $
17  *
18  *-------------------------------------------------------------------------
19  */
20 #include "postgres.h"
21
22 #include <unistd.h>
23
24 #include "access/clog.h"
25 #include "access/genam.h"
26 #include "access/heapam.h"
27 #include "access/xlog.h"
28 #include "catalog/catalog.h"
29 #include "catalog/catname.h"
30 #include "catalog/namespace.h"
31 #include "catalog/pg_database.h"
32 #include "catalog/pg_index.h"
33 #include "commands/vacuum.h"
34 #include "executor/executor.h"
35 #include "miscadmin.h"
36 #include "storage/freespace.h"
37 #include "storage/sinval.h"
38 #include "storage/smgr.h"
39 #include "tcop/pquery.h"
40 #include "utils/acl.h"
41 #include "utils/builtins.h"
42 #include "utils/fmgroids.h"
43 #include "utils/inval.h"
44 #include "utils/lsyscache.h"
45 #include "utils/relcache.h"
46 #include "utils/syscache.h"
47 #include "pgstat.h"
48
49
50 typedef struct VacPageData
51 {
52         BlockNumber blkno;                      /* BlockNumber of this Page */
53         Size            free;                   /* FreeSpace on this Page */
54         uint16          offsets_used;   /* Number of OffNums used by vacuum */
55         uint16          offsets_free;   /* Number of OffNums free or to be free */
56         OffsetNumber offsets[1];        /* Array of free OffNums */
57 } VacPageData;
58
59 typedef VacPageData *VacPage;
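
/*
 * Note on offsets[1]: this is the classic C idiom for a variable-length
 * trailing array.  A VacPage is palloc'd with room for however many
 * OffsetNumbers are actually needed; scan_heap below allocates a maximal
 * workspace (MaxOffsetNumber entries) and copy_vac_page() then trims each
 * entry down to the offsets actually recorded.
 */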

typedef struct VacPageListData
{
        BlockNumber empty_end_pages;    /* Number of "empty" end-pages */
        int                     num_pages;              /* Number of pages in pagedesc */
        int                     num_allocated_pages;    /* Number of allocated pages in
                                                                                 * pagedesc */
        VacPage    *pagedesc;           /* Descriptions of pages */
} VacPageListData;

typedef VacPageListData *VacPageList;

typedef struct VTupleLinkData
{
        ItemPointerData new_tid;
        ItemPointerData this_tid;
} VTupleLinkData;

typedef VTupleLinkData *VTupleLink;

typedef struct VTupleMoveData
{
        ItemPointerData tid;            /* tuple ID */
        VacPage         vacpage;                /* where to move */
        bool            cleanVpd;               /* clean vacpage before using */
} VTupleMoveData;

typedef VTupleMoveData *VTupleMove;

typedef struct VRelStats
{
        BlockNumber rel_pages;
        double          rel_tuples;
        Size            min_tlen;
        Size            max_tlen;
        bool            hasindex;
        int                     num_vtlinks;
        VTupleLink      vtlinks;
} VRelStats;


static MemoryContext vac_context = NULL;

static int      elevel = -1;

static TransactionId OldestXmin;
static TransactionId FreezeLimit;


/* non-export function prototypes */
static List *getrels(const RangeVar *vacrel, const char *stmttype);
static void vac_update_dbstats(Oid dbid,
                                   TransactionId vacuumXID,
                                   TransactionId frozenXID);
static void vac_truncate_clog(TransactionId vacuumXID,
                                  TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind);
static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt);
static void scan_heap(VRelStats *vacrelstats, Relation onerel,
                  VacPageList vacuum_pages, VacPageList fraged_pages);
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacuum_pages, VacPageList fraged_pages,
                        int nindexes, Relation *Irel);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
                        VacPageList vacpagelist);
static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
                         double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
static bool tid_reaped(ItemPointer itemptr, void *state);
static bool dummy_tid_reaped(ItemPointer itemptr, void *state);
static void vac_update_fsm(Relation onerel, VacPageList fraged_pages,
                           BlockNumber rel_pages);
static VacPage copy_vac_page(VacPage vacpage);
static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
static void *vac_bsearch(const void *key, const void *base,
                        size_t nelem, size_t size,
                        int (*compar) (const void *, const void *));
static int      vac_cmp_blk(const void *left, const void *right);
static int      vac_cmp_offno(const void *left, const void *right);
static int      vac_cmp_vtlinks(const void *left, const void *right);
static bool enough_space(VacPage vacpage, Size len);


/****************************************************************************
 *                                                                          *
 *                      Code common to all flavors of VACUUM and ANALYZE    *
 *                                                                          *
 ****************************************************************************
 */


/*
 * Primary entry point for VACUUM and ANALYZE commands.
 */
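/*
 * For reference, typical SQL commands that arrive here (an illustrative
 * sketch only; "mytable" is a placeholder name):
 *
 *              VACUUM;                                 -- lazy vacuum of all tables
 *              VACUUM FULL VERBOSE mytable;    -- full vacuum, verbose output
 *              VACUUM ANALYZE mytable;         -- vacuum, then analyze
 *              ANALYZE mytable;                -- statistics only, no vacuum
 */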
void
vacuum(VacuumStmt *vacstmt)
{
        const char *stmttype = vacstmt->vacuum ? "VACUUM" : "ANALYZE";
        MemoryContext anl_context = NULL;
        TransactionId initialOldestXmin = InvalidTransactionId;
        TransactionId initialFreezeLimit = InvalidTransactionId;
        bool            all_rels;
        List       *vrl,
                           *cur;

        if (vacstmt->verbose)
                elevel = INFO;
        else
                elevel = DEBUG2;

        /*
         * We cannot run VACUUM inside a user transaction block; if we were
         * inside a transaction, then our commit- and
         * start-transaction-command calls would not have the intended effect!
         * Furthermore, the forced commit that occurs before truncating the
         * relation's file would have the effect of committing the rest of the
         * user's transaction too, which would certainly not be the desired
         * behavior.
         */
        if (vacstmt->vacuum)
                PreventTransactionChain((void *) vacstmt, stmttype);

        /*
         * Send info about dead objects to the statistics collector
         */
        if (vacstmt->vacuum)
                pgstat_vacuum_tabstat();

        /*
         * Create special memory context for cross-transaction storage.
         *
         * Since it is a child of PortalContext, it will go away eventually even
         * if we suffer an error; there's no need for special abort cleanup
         * logic.
         */
        vac_context = AllocSetContextCreate(PortalContext,
                                                                                "Vacuum",
                                                                                ALLOCSET_DEFAULT_MINSIZE,
                                                                                ALLOCSET_DEFAULT_INITSIZE,
                                                                                ALLOCSET_DEFAULT_MAXSIZE);

        /*
         * If we are running only ANALYZE, we don't need per-table
         * transactions, but we still need a memory context with table
         * lifetime.
         */
        if (vacstmt->analyze && !vacstmt->vacuum)
                anl_context = AllocSetContextCreate(PortalContext,
                                                                                        "Analyze",
                                                                                        ALLOCSET_DEFAULT_MINSIZE,
                                                                                        ALLOCSET_DEFAULT_INITSIZE,
                                                                                        ALLOCSET_DEFAULT_MAXSIZE);

        /* Assume we are processing everything unless one table is mentioned */
        all_rels = (vacstmt->relation == NULL);

        /* Build list of relations to process (note this lives in vac_context) */
        vrl = getrels(vacstmt->relation, stmttype);

        /*
         * Formerly, there was code here to prevent more than one VACUUM from
         * executing concurrently in the same database.  However, there's no
         * good reason to prevent that, and manually removing lockfiles after
         * a vacuum crash was a pain for dbadmins.  So, forget about
         * lockfiles, and just rely on the locks we grab on each target table
         * to ensure that there aren't two VACUUMs running on the same table
         * at the same time.
         */

        /*
         * The strangeness with committing and starting transactions here is
         * due to wanting to run each table's VACUUM as a separate
         * transaction, so that we don't hold locks unnecessarily long.  Also,
         * if we are doing VACUUM ANALYZE, the ANALYZE part runs as a separate
         * transaction from the VACUUM to further reduce locking.
         *
         * vacuum_rel expects to be entered with no transaction active; it will
         * start and commit its own transaction.  But we are called by an SQL
         * command, and so we are executing inside a transaction already.  We
         * commit the transaction started in PostgresMain() here, and start
         * another one before exiting to match the commit waiting for us back
         * in PostgresMain().
         *
         * In the case of an ANALYZE statement (no vacuum, just analyze) it's
         * okay to run the whole thing in the outer transaction, and so we
         * skip transaction start/stop operations.
         */
        if (vacstmt->vacuum)
        {
                if (all_rels)
                {
                        /*
                         * It's a database-wide VACUUM.
                         *
                         * Compute the initially applicable OldestXmin and FreezeLimit
                         * XIDs, so that we can record these values at the end of the
                         * VACUUM. Note that individual tables may well be processed
                         * with newer values, but we can guarantee that no
                         * (non-shared) relations are processed with older ones.
                         *
                         * It is okay to record non-shared values in pg_database, even
                         * though we may vacuum shared relations with older cutoffs,
                         * because only the minimum of the values present in
                         * pg_database matters.  We can be sure that shared relations
                         * have at some time been vacuumed with cutoffs no worse than
                         * the global minimum; for, if there is a backend in some
                         * other DB with xmin = OLDXMIN that's determining the cutoff
                         * with which we vacuum shared relations, it is not possible
                         * for that database to have a cutoff newer than OLDXMIN
                         * recorded in pg_database.
                         */
                        vacuum_set_xid_limits(vacstmt, false,
                                                                  &initialOldestXmin,
                                                                  &initialFreezeLimit);
                }

                /* matches the StartTransaction in PostgresMain() */
                CommitTransactionCommand();
        }

        /*
         * Loop to process each selected relation.
         */
        foreach(cur, vrl)
        {
                Oid                     relid = lfirsto(cur);

                if (vacstmt->vacuum)
                {
                        if (! vacuum_rel(relid, vacstmt, RELKIND_RELATION))
                                all_rels = false; /* forget about updating dbstats */
                }
                if (vacstmt->analyze)
                {
                        MemoryContext old_context = NULL;

                        /*
                         * If we vacuumed, use new transaction for analyze.
                         * Otherwise, we can use the outer transaction, but we still
                         * need to call analyze_rel in a memory context that will be
                         * cleaned up on return (else we leak memory while processing
                         * multiple tables).
                         */
                        if (vacstmt->vacuum)
                        {
                                StartTransactionCommand();
                                SetQuerySnapshot();     /* might be needed for functions in indexes */
                        }
                        else
                                old_context = MemoryContextSwitchTo(anl_context);

                        analyze_rel(relid, vacstmt);

                        if (vacstmt->vacuum)
                                CommitTransactionCommand();
                        else
                        {
                                MemoryContextSwitchTo(old_context);
                                MemoryContextResetAndDeleteChildren(anl_context);
                        }
                }
        }

        /*
         * Finish up processing.
         */
        if (vacstmt->vacuum)
        {
                /* here, we are not in a transaction */

                /*
                 * This matches the CommitTransaction waiting for us in
                 * PostgresMain().
                 */
                StartTransactionCommand();

                /*
                 * If it was a database-wide VACUUM, print FSM usage statistics
                 * (we don't make you be superuser to see these).
                 */
                if (vacstmt->relation == NULL)
                        PrintFreeSpaceMapStatistics(elevel);

                /*
                 * If we completed a database-wide VACUUM without skipping any
                 * relations, update the database's pg_database row with info
                 * about the transaction IDs used, and try to truncate pg_clog.
                 */
                if (all_rels)
                {
                        vac_update_dbstats(MyDatabaseId,
                                                           initialOldestXmin, initialFreezeLimit);
                        vac_truncate_clog(initialOldestXmin, initialFreezeLimit);
                }
        }

        /*
         * Clean up working storage --- note we must do this after
         * StartTransactionCommand, else we might be trying to delete the
         * active context!
         */
        MemoryContextDelete(vac_context);
        vac_context = NULL;

        if (anl_context)
                MemoryContextDelete(anl_context);
}

/*
 * Build a list of Oids for each relation to be processed
 *
 * The list is built in vac_context so that it will survive across our
 * per-relation transactions.
 */
static List *
getrels(const RangeVar *vacrel, const char *stmttype)
{
        List       *vrl = NIL;
        MemoryContext oldcontext;

        if (vacrel)
        {
                /* Process specific relation */
                Oid                     relid;

                relid = RangeVarGetRelid(vacrel, false);

                /* Make a relation list entry for this guy */
                oldcontext = MemoryContextSwitchTo(vac_context);
                vrl = lappendo(vrl, relid);
                MemoryContextSwitchTo(oldcontext);
        }
        else
        {
                /* Process all plain relations listed in pg_class */
                Relation        pgclass;
                HeapScanDesc scan;
                HeapTuple       tuple;
                ScanKeyData key;

                ScanKeyEntryInitialize(&key, 0x0,
                                                           Anum_pg_class_relkind,
                                                           F_CHAREQ,
                                                           CharGetDatum(RELKIND_RELATION));

                pgclass = heap_openr(RelationRelationName, AccessShareLock);

                scan = heap_beginscan(pgclass, SnapshotNow, 1, &key);

                while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
                {
                        /* Make a relation list entry for this guy */
                        oldcontext = MemoryContextSwitchTo(vac_context);
                        vrl = lappendo(vrl, HeapTupleGetOid(tuple));
                        MemoryContextSwitchTo(oldcontext);
                }

                heap_endscan(scan);
                heap_close(pgclass, AccessShareLock);
        }

        return vrl;
}

/*
 * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points
 */
void
vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel,
                                          TransactionId *oldestXmin,
                                          TransactionId *freezeLimit)
{
        TransactionId limit;

        *oldestXmin = GetOldestXmin(sharedRel);

        Assert(TransactionIdIsNormal(*oldestXmin));

        if (vacstmt->freeze)
        {
                /* FREEZE option: use oldest Xmin as freeze cutoff too */
                limit = *oldestXmin;
        }
        else
        {
                /*
                 * Normal case: freeze cutoff is well in the past, to wit, about
                 * halfway to the wrap horizon
                 */
                limit = GetCurrentTransactionId() - (MaxTransactionId >> 2);
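
                /*
                 * Worked example: XIDs here are 32 bits, so MaxTransactionId >> 2
                 * is about one billion.  A tuple thus keeps its original xmin
                 * until it is roughly a billion transactions old, after which
                 * VACUUM may overwrite it with FrozenTransactionId.
                 */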
        }

        /*
         * Be careful not to generate a "permanent" XID
         */
        if (!TransactionIdIsNormal(limit))
                limit = FirstNormalTransactionId;

        /*
         * Ensure sane relationship of limits
         */
        if (TransactionIdFollows(limit, *oldestXmin))
        {
                elog(WARNING, "oldest Xmin is far in the past --- close open transactions soon to avoid wraparound problems");
                limit = *oldestXmin;
        }

        *freezeLimit = limit;
}


/*
 *      vac_update_relstats() -- update statistics for one relation
 *
 *              Update the whole-relation statistics that are kept in its pg_class
 *              row.  There are additional stats that will be updated if we are
 *              doing ANALYZE, but we always update these stats.  This routine works
 *              for both index and heap relation entries in pg_class.
 *
 *              We violate no-overwrite semantics here by storing new values for the
 *              statistics columns directly into the pg_class tuple that's already on
 *              the page.  The reason for this is that if we updated these tuples in
 *              the usual way, vacuuming pg_class itself wouldn't work very well ---
 *              by the time we got done with a vacuum cycle, most of the tuples in
 *              pg_class would've been obsoleted.  Of course, this only works for
 *              fixed-size never-null columns, but these are.
 *
 *              This routine is shared by full VACUUM, lazy VACUUM, and stand-alone
 *              ANALYZE.
 */
void
vac_update_relstats(Oid relid, BlockNumber num_pages, double num_tuples,
                                        bool hasindex)
{
        Relation        rd;
        HeapTupleData rtup;
        HeapTuple       ctup;
        Form_pg_class pgcform;
        Buffer          buffer;

        /*
         * update number of tuples and number of pages in pg_class
         */
        rd = heap_openr(RelationRelationName, RowExclusiveLock);

        ctup = SearchSysCache(RELOID,
                                                  ObjectIdGetDatum(relid),
                                                  0, 0, 0);
        if (!HeapTupleIsValid(ctup))
                elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
                         relid);

        /* get the buffer cache tuple */
        rtup.t_self = ctup->t_self;
        ReleaseSysCache(ctup);
        if (!heap_fetch(rd, SnapshotNow, &rtup, &buffer, false, NULL))
                elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
                         relid);

        /* overwrite the existing statistics in the tuple */
        pgcform = (Form_pg_class) GETSTRUCT(&rtup);
        pgcform->relpages = (int32) num_pages;
        pgcform->reltuples = num_tuples;
        pgcform->relhasindex = hasindex;

        /*
         * If we have discovered that there are no indexes, then there's no
         * primary key either.  This could be done more thoroughly...
         */
        if (!hasindex)
                pgcform->relhaspkey = false;

        /*
         * Invalidate the tuple in the catcaches; this also arranges to flush
         * the relation's relcache entry.  (If we fail to commit for some
         * reason, no flush will occur, but no great harm is done since there
         * are no noncritical state updates here.)
         */
        CacheInvalidateHeapTuple(rd, &rtup);

        /* Write the buffer */
        WriteBuffer(buffer);

        heap_close(rd, RowExclusiveLock);
}


/*
 *      vac_update_dbstats() -- update statistics for one database
 *
 *              Update the whole-database statistics that are kept in its pg_database
 *              row.
 *
 *              We violate no-overwrite semantics here by storing new values for the
 *              statistics columns directly into the tuple that's already on the page.
 *              As with vac_update_relstats, this avoids leaving dead tuples behind
 *              after a VACUUM; which is good since GetRawDatabaseInfo
 *              can get confused by finding dead tuples in pg_database.
 *
 *              This routine is shared by full and lazy VACUUM.  Note that it is only
 *              applied after a database-wide VACUUM operation.
 */
static void
vac_update_dbstats(Oid dbid,
                                   TransactionId vacuumXID,
                                   TransactionId frozenXID)
{
        Relation        relation;
        ScanKeyData entry[1];
        HeapScanDesc scan;
        HeapTuple       tuple;
        Form_pg_database dbform;

        relation = heap_openr(DatabaseRelationName, RowExclusiveLock);

        /* Must use a heap scan, since there's no syscache for pg_database */
        ScanKeyEntryInitialize(&entry[0], 0x0,
                                                   ObjectIdAttributeNumber, F_OIDEQ,
                                                   ObjectIdGetDatum(dbid));

        scan = heap_beginscan(relation, SnapshotNow, 1, entry);

        tuple = heap_getnext(scan, ForwardScanDirection);

        if (!HeapTupleIsValid(tuple))
                elog(ERROR, "database %u does not exist", dbid);

        dbform = (Form_pg_database) GETSTRUCT(tuple);

        /* overwrite the existing statistics in the tuple */
        dbform->datvacuumxid = vacuumXID;
        dbform->datfrozenxid = frozenXID;

        /* invalidate the tuple in the cache and write the buffer */
        CacheInvalidateHeapTuple(relation, tuple);
        WriteNoReleaseBuffer(scan->rs_cbuf);

        heap_endscan(scan);

        heap_close(relation, RowExclusiveLock);
}


/*
 *      vac_truncate_clog() -- attempt to truncate the commit log
 *
 *              Scan pg_database to determine the system-wide oldest datvacuumxid,
 *              and use it to truncate the transaction commit log (pg_clog).
 *              Also generate a warning if the system-wide oldest datfrozenxid
 *              seems to be in danger of wrapping around.
 *
 *              The passed XIDs are simply the ones I just wrote into my pg_database
 *              entry.  They're used to initialize the "min" calculations.
 *
 *              This routine is shared by full and lazy VACUUM.  Note that it is only
 *              applied after a database-wide VACUUM operation.
 */
static void
vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
{
        TransactionId myXID;
        Relation        relation;
        HeapScanDesc scan;
        HeapTuple       tuple;
        int32           age;
        bool            vacuumAlreadyWrapped = false;
        bool            frozenAlreadyWrapped = false;

        myXID = GetCurrentTransactionId();

        relation = heap_openr(DatabaseRelationName, AccessShareLock);

        scan = heap_beginscan(relation, SnapshotNow, 0, NULL);

        while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        {
                Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple);

                /* Ignore non-connectable databases (eg, template0) */
                /* It's assumed that these have been frozen correctly */
                if (!dbform->datallowconn)
                        continue;

                if (TransactionIdIsNormal(dbform->datvacuumxid))
                {
                        if (TransactionIdPrecedes(myXID, dbform->datvacuumxid))
                                vacuumAlreadyWrapped = true;
                        else if (TransactionIdPrecedes(dbform->datvacuumxid, vacuumXID))
                                vacuumXID = dbform->datvacuumxid;
                }
                if (TransactionIdIsNormal(dbform->datfrozenxid))
                {
                        if (TransactionIdPrecedes(myXID, dbform->datfrozenxid))
                                frozenAlreadyWrapped = true;
                        else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID))
                                frozenXID = dbform->datfrozenxid;
                }
        }

        heap_endscan(scan);

        heap_close(relation, AccessShareLock);

        /*
         * Do not truncate CLOG if we seem to have suffered wraparound
         * already; the computed minimum XID might be bogus.
         */
        if (vacuumAlreadyWrapped)
        {
                elog(WARNING, "Some databases have not been vacuumed in over 2 billion transactions."
                         "\n\tYou may have already suffered transaction-wraparound data loss.");
                return;
        }

        /* Truncate CLOG to the oldest vacuumxid */
        TruncateCLOG(vacuumXID);

        /* Give warning about impending wraparound problems */
        if (frozenAlreadyWrapped)
        {
                elog(WARNING, "Some databases have not been vacuumed in over 1 billion transactions."
                         "\n\tBetter vacuum them soon, or you may have a wraparound failure.");
        }
        else
        {
                age = (int32) (myXID - frozenXID);
                if (age > (int32) ((MaxTransactionId >> 3) * 3))
                        elog(WARNING, "Some databases have not been vacuumed in %d transactions."
                                 "\n\tBetter vacuum them within %d transactions,"
                                 "\n\tor you may have a wraparound failure.",
                                 age, (int32) (MaxTransactionId >> 1) - age);
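
                /*
                 * Numeric sketch, again for 32-bit XIDs: the warning threshold
                 * (MaxTransactionId >> 3) * 3 is about 1.6 billion, while the
                 * quoted headroom counts down from (MaxTransactionId >> 1),
                 * about 2.1 billion, the point of actual wraparound failure.
                 */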
        }
}


/****************************************************************************
 *                                                                          *
 *                      Code common to both flavors of VACUUM               *
 *                                                                          *
 ****************************************************************************
 */


/*
 *      vacuum_rel() -- vacuum one heap relation
 *
 *              Returns TRUE if we actually processed the relation (or can ignore it
 *              for some reason), FALSE if we failed to process it due to permissions
 *              or other reasons.  (A FALSE result really means that some data
 *              may have been left unvacuumed, so we can't update XID stats.)
 *
 *              Doing one heap at a time incurs extra overhead, since we need to
 *              check that the heap exists again just before we vacuum it.  The
 *              reason that we do this is so that vacuuming can be spread across
 *              many small transactions.  Otherwise, two-phase locking would require
 *              us to lock the entire database during one pass of the vacuum cleaner.
 *
 *              At entry and exit, we are not inside a transaction.
 */
static bool
vacuum_rel(Oid relid, VacuumStmt *vacstmt, char expected_relkind)
{
        LOCKMODE        lmode;
        Relation        onerel;
        LockRelId       onerelid;
        Oid                     toast_relid;
        bool            result;

        /* Begin a transaction for vacuuming this relation */
        StartTransactionCommand();
        SetQuerySnapshot();                     /* might be needed for functions in indexes */

        /*
         * Check for user-requested abort.  Note we want this to be inside a
         * transaction, so xact.c doesn't issue useless WARNING.
         */
        CHECK_FOR_INTERRUPTS();

        /*
         * Race condition -- if the pg_class tuple has gone away since the
         * last time we saw it, we don't need to vacuum it.
         */
        if (!SearchSysCacheExists(RELOID,
                                                          ObjectIdGetDatum(relid),
                                                          0, 0, 0))
        {
                CommitTransactionCommand();
                return true;                    /* okay 'cause no data there */
        }

        /*
         * Determine the type of lock we want --- hard exclusive lock for a
         * FULL vacuum, but just ShareUpdateExclusiveLock for concurrent
         * vacuum.  Either way, we can be sure that no other backend is
         * vacuuming the same table.
         */
        lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock;

        /*
         * Open the class, get an appropriate lock on it, and check
         * permissions.
         *
         * We allow the user to vacuum a table if he is superuser, the table
         * owner, or the database owner (but in the latter case, only if it's
         * not a shared relation).  pg_class_ownercheck includes the superuser
         * case.
         *
         * Note we choose to treat permissions failure as a WARNING and keep
         * trying to vacuum the rest of the DB --- is this appropriate?
         */
        onerel = relation_open(relid, lmode);

        if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) ||
                  (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared)))
        {
                elog(WARNING, "Skipping \"%s\" --- only table or database owner can VACUUM it",
                         RelationGetRelationName(onerel));
                relation_close(onerel, lmode);
                CommitTransactionCommand();
                return false;
        }

        /*
         * Check that it's a plain table; we used to do this in getrels() but
         * seems safer to check after we've locked the relation.
         */
        if (onerel->rd_rel->relkind != expected_relkind)
        {
                elog(WARNING, "Skipping \"%s\" --- can not process indexes, views or special system tables",
                         RelationGetRelationName(onerel));
                relation_close(onerel, lmode);
                CommitTransactionCommand();
                return false;
        }

        /*
         * Silently ignore tables that are temp tables of other backends ---
         * trying to vacuum these will lead to great unhappiness, since their
         * contents are probably not up-to-date on disk.  (We don't throw a
         * warning here; it would just lead to chatter during a database-wide
         * VACUUM.)
         */
        if (isOtherTempNamespace(RelationGetNamespace(onerel)))
        {
                relation_close(onerel, lmode);
                CommitTransactionCommand();
                return true;                    /* assume no long-lived data in temp tables */
        }

        /*
         * Get a session-level lock too. This will protect our access to the
         * relation across multiple transactions, so that we can vacuum the
         * relation's TOAST table (if any) secure in the knowledge that no one
         * is deleting the parent relation.
         *
         * NOTE: this cannot block, even if someone else is waiting for access,
         * because the lock manager knows that both lock requests are from the
         * same process.
         */
        onerelid = onerel->rd_lockInfo.lockRelId;
        LockRelationForSession(&onerelid, lmode);

        /*
         * Remember the relation's TOAST relation for later
         */
        toast_relid = onerel->rd_rel->reltoastrelid;

        /*
         * Do the actual work --- either FULL or "lazy" vacuum
         */
        if (vacstmt->full)
                full_vacuum_rel(onerel, vacstmt);
        else
                lazy_vacuum_rel(onerel, vacstmt);

        result = true;                          /* did the vacuum */

        /* all done with this class, but hold lock until commit */
        relation_close(onerel, NoLock);

        /*
         * Complete the transaction and free all temporary memory used.
         */
        CommitTransactionCommand();

        /*
         * If the relation has a secondary toast rel, vacuum that too while we
         * still hold the session lock on the master table.  Note however that
         * "analyze" will not get done on the toast table.  This is good,
         * because the toaster always uses hardcoded index access and
         * statistics are totally unimportant for toast relations.
         */
        if (toast_relid != InvalidOid)
        {
                if (! vacuum_rel(toast_relid, vacstmt, RELKIND_TOASTVALUE))
                        result = false;         /* failed to vacuum the TOAST table? */
        }

        /*
         * Now release the session-level lock on the master table.
         */
        UnlockRelationForSession(&onerelid, lmode);

        return result;
}


/****************************************************************************
 *                                                                          *
 *                      Code for VACUUM FULL (only)                         *
 *                                                                          *
 ****************************************************************************
 */


/*
 *      full_vacuum_rel() -- perform FULL VACUUM for one heap relation
 *
 *              This routine vacuums a single heap, cleans out its indexes, and
 *              updates its num_pages and num_tuples statistics.
 *
 *              At entry, we have already established a transaction and opened
 *              and locked the relation.
 */
static void
full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
{
        VacPageListData vacuum_pages;           /* List of pages to vacuum and/or
                                                                                 * clean indexes */
        VacPageListData fraged_pages;           /* List of pages with space enough
                                                                                 * for re-using */
        Relation   *Irel;
        int                     nindexes,
                                i;
        VRelStats  *vacrelstats;
        bool            reindex = false;

        if (IsIgnoringSystemIndexes() &&
                IsSystemRelation(onerel))
                reindex = true;

        vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared,
                                                  &OldestXmin, &FreezeLimit);

        /*
         * Set up statistics-gathering machinery.
         */
        vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
        vacrelstats->rel_pages = 0;
        vacrelstats->rel_tuples = 0;
        vacrelstats->hasindex = false;

        /* scan the heap */
        vacuum_pages.num_pages = fraged_pages.num_pages = 0;
        scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);

        /* Now open all indexes of the relation */
        vac_open_indexes(onerel, &nindexes, &Irel);
        if (!Irel)
                reindex = false;
        else if (!RelationGetForm(onerel)->relhasindex)
                reindex = true;
        if (nindexes > 0)
                vacrelstats->hasindex = true;

#ifdef NOT_USED

        /*
         * reindex in VACUUM is dangerous under WAL. ifdef out until it
         * becomes safe.
         */
        if (reindex)
        {
                vac_close_indexes(nindexes, Irel);
                Irel = (Relation *) NULL;
                activate_indexes_of_a_table(onerel, false);
        }
#endif   /* NOT_USED */

        /* Clean/scan index relation(s) */
        if (Irel != (Relation *) NULL)
        {
                if (vacuum_pages.num_pages > 0)
                {
                        for (i = 0; i < nindexes; i++)
                                vacuum_index(&vacuum_pages, Irel[i],
                                                         vacrelstats->rel_tuples, 0);
                }
                else
                {
                        /* just scan indexes to update statistics */
                        for (i = 0; i < nindexes; i++)
                                scan_index(Irel[i], vacrelstats->rel_tuples);
                }
        }

        if (fraged_pages.num_pages > 0)
        {
                /* Try to shrink heap */
                repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
                                        nindexes, Irel);
                vac_close_indexes(nindexes, Irel);
        }
        else
        {
                vac_close_indexes(nindexes, Irel);
                if (vacuum_pages.num_pages > 0)
                {
                        /* Clean pages from vacuum_pages list */
                        vacuum_heap(vacrelstats, onerel, &vacuum_pages);
                }
                else
                {
                        /*
                         * Flush dirty pages out to disk.  We must do this even if we
                         * didn't do anything else, because we want to ensure that all
                         * tuples have correct on-row commit status on disk (see
                         * bufmgr.c's comments for FlushRelationBuffers()).
                         */
                        i = FlushRelationBuffers(onerel, vacrelstats->rel_pages);
                        if (i < 0)
                                elog(ERROR, "VACUUM (full_vacuum_rel): FlushRelationBuffers returned %d",
                                         i);
                }
        }

#ifdef NOT_USED
        if (reindex)
                activate_indexes_of_a_table(onerel, true);
#endif   /* NOT_USED */

        /* update shared free space map with final free space info */
        vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages);

        /* update statistics in pg_class */
        vac_update_relstats(RelationGetRelid(onerel), vacrelstats->rel_pages,
                                                vacrelstats->rel_tuples, vacrelstats->hasindex);
}


/*
 *      scan_heap() -- scan an open heap relation
 *
 *              This routine sets commit status bits, constructs vacuum_pages (list
 *              of pages we need to compact free space on and/or clean indexes of
 *              deleted tuples), constructs fraged_pages (list of pages with free
 *              space that tuples could be moved into), and calculates statistics
 *              on the number of live tuples in the heap.
 */
static void
scan_heap(VRelStats *vacrelstats, Relation onerel,
                  VacPageList vacuum_pages, VacPageList fraged_pages)
{
        BlockNumber nblocks,
                                blkno;
        ItemId          itemid;
        Buffer          buf;
        HeapTupleData tuple;
        OffsetNumber offnum,
                                maxoff;
        bool            pgchanged,
                                tupgone,
                                notup;
        char       *relname;
        VacPage         vacpage,
                                vacpagecopy;
        BlockNumber empty_pages,
                                new_pages,
                                changed_pages,
                                empty_end_pages;
        double          num_tuples,
                                tups_vacuumed,
                                nkeep,
                                nunused;
        double          free_size,
                                usable_free_size;
        Size            min_tlen = MaxTupleSize;
        Size            max_tlen = 0;
        int                     i;
        bool            do_shrinking = true;
        VTupleLink      vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
        int                     num_vtlinks = 0;
        int                     free_vtlinks = 100;
        VacRUsage       ru0;

        vac_init_rusage(&ru0);

        relname = RelationGetRelationName(onerel);
        elog(elevel, "--Relation %s.%s--",
                 get_namespace_name(RelationGetNamespace(onerel)),
                 relname);

        empty_pages = new_pages = changed_pages = empty_end_pages = 0;
        num_tuples = tups_vacuumed = nkeep = nunused = 0;
        free_size = 0;

        nblocks = RelationGetNumberOfBlocks(onerel);

        /*
         * We initially create each VacPage item in a maximal-sized workspace,
         * then copy the workspace into a just-large-enough copy.
         */
        vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
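        /*
         * Sizing note (assuming the default 8K BLCKSZ): MaxOffsetNumber is
         * BLCKSZ / sizeof(ItemIdData) = 2048, so this maximal workspace is
         * only a few kilobytes.
         */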
1064
1065         for (blkno = 0; blkno < nblocks; blkno++)
1066         {
1067                 Page            page,
1068                                         tempPage = NULL;
1069                 bool            do_reap,
1070                                         do_frag;
1071
1072                 CHECK_FOR_INTERRUPTS();
1073
1074                 buf = ReadBuffer(onerel, blkno);
1075                 page = BufferGetPage(buf);
1076
1077                 vacpage->blkno = blkno;
1078                 vacpage->offsets_used = 0;
1079                 vacpage->offsets_free = 0;
1080
1081                 if (PageIsNew(page))
1082                 {
1083                         elog(WARNING, "Rel %s: Uninitialized page %u - fixing",
1084                                  relname, blkno);
1085                         PageInit(page, BufferGetPageSize(buf), 0);
1086                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1087                         free_size += vacpage->free;
1088                         new_pages++;
1089                         empty_end_pages++;
1090                         vacpagecopy = copy_vac_page(vacpage);
1091                         vpage_insert(vacuum_pages, vacpagecopy);
1092                         vpage_insert(fraged_pages, vacpagecopy);
1093                         WriteBuffer(buf);
1094                         continue;
1095                 }
1096
1097                 if (PageIsEmpty(page))
1098                 {
1099                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1100                         free_size += vacpage->free;
1101                         empty_pages++;
1102                         empty_end_pages++;
1103                         vacpagecopy = copy_vac_page(vacpage);
1104                         vpage_insert(vacuum_pages, vacpagecopy);
1105                         vpage_insert(fraged_pages, vacpagecopy);
1106                         ReleaseBuffer(buf);
1107                         continue;
1108                 }
1109
1110                 pgchanged = false;
1111                 notup = true;
1112                 maxoff = PageGetMaxOffsetNumber(page);
1113                 for (offnum = FirstOffsetNumber;
1114                          offnum <= maxoff;
1115                          offnum = OffsetNumberNext(offnum))
1116                 {
1117                         uint16          sv_infomask;
1118
1119                         itemid = PageGetItemId(page, offnum);
1120
1121                         /*
1122                          * Collect un-used items too - it's possible to have indexes
1123                          * pointing here after crash.
1124                          */
1125                         if (!ItemIdIsUsed(itemid))
1126                         {
1127                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1128                                 nunused += 1;
1129                                 continue;
1130                         }
1131
1132                         tuple.t_datamcxt = NULL;
1133                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1134                         tuple.t_len = ItemIdGetLength(itemid);
1135                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
1136
1137                         tupgone = false;
1138                         sv_infomask = tuple.t_data->t_infomask;
1139
1140                         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin))
1141                         {
1142                                 case HEAPTUPLE_DEAD:
1143                                         tupgone = true;         /* we can delete the tuple */
1144                                         break;
1145                                 case HEAPTUPLE_LIVE:
1146
1147                                         /*
1148                                          * Tuple is good.  Consider whether to replace its
1149                                          * xmin value with FrozenTransactionId.
1150                                          */
1151                                         if (TransactionIdIsNormal(HeapTupleHeaderGetXmin(tuple.t_data)) &&
1152                                                 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
1153                                                                                           FreezeLimit))
1154                                         {
1155                                                 HeapTupleHeaderSetXmin(tuple.t_data, FrozenTransactionId);
1156                                                 /* infomask should be okay already */
1157                                                 Assert(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED);
1158                                                 pgchanged = true;
1159                                         }
1160                                         break;
1161                                 case HEAPTUPLE_RECENTLY_DEAD:
1162
1163                                         /*
1164                                          * If tuple is recently deleted then we must not
1165                                          * remove it from relation.
1166                                          */
1167                                         nkeep += 1;
1168
1169                                         /*
1170                                          * If we do shrinking and this tuple is updated one
1171                                          * then remember it to construct updated tuple
1172                                          * dependencies.
1173                                          */
1174                                         if (do_shrinking &&
1175                                                 !(ItemPointerEquals(&(tuple.t_self),
1176                                                                                         &(tuple.t_data->t_ctid))))
1177                                         {
1178                                                 if (free_vtlinks == 0)
1179                                                 {
1180                                                         free_vtlinks = 1000;
1181                                                         vtlinks = (VTupleLink) repalloc(vtlinks,
1182                                                                                    (free_vtlinks + num_vtlinks) *
1183                                                                                                  sizeof(VTupleLinkData));
1184                                                 }
1185                                                 vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
1186                                                 vtlinks[num_vtlinks].this_tid = tuple.t_self;
1187                                                 free_vtlinks--;
1188                                                 num_vtlinks++;
1189                                         }
1190                                         break;
1191                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
1192
1193                                         /*
1194                                          * This should not happen, since we hold exclusive
1195                                          * lock on the relation; shouldn't we raise an error?
1196                                          */
1197                                         elog(WARNING, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
1198                                                  relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data));
1199                                         do_shrinking = false;
1200                                         break;
1201                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
1202
1203                                         /*
1204                                          * This should not happen, since we hold exclusive
1205                                          * lock on the relation; shouldn't we raise an error?
1206                                          */
1207                                         elog(WARNING, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
1208                                                  relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data));
1209                                         do_shrinking = false;
1210                                         break;
1211                                 default:
1212                                         elog(ERROR, "Unexpected HeapTupleSatisfiesVacuum result");
1213                                         break;
1214                         }
1215
1216                         /* check for hint-bit update by HeapTupleSatisfiesVacuum */
1217                         if (sv_infomask != tuple.t_data->t_infomask)
1218                                 pgchanged = true;
1219
1220                         /*
1221                          * Other checks...
1222                          */
1223                         if (onerel->rd_rel->relhasoids &&
1224                                 !OidIsValid(HeapTupleGetOid(&tuple)))
1225                                 elog(WARNING, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
1226                                          relname, blkno, offnum, (int) tupgone);
1227
1228                         if (tupgone)
1229                         {
1230                                 ItemId          lpp;
1231
1232                                 /*
1233                                  * Here we are building a temporary copy of the page with
1234                                  * dead tuples removed.  Below we will apply
1235                                  * PageRepairFragmentation to the copy, so that we can
1236                                  * determine how much space will be available after
1237                                  * removal of dead tuples.      But note we are NOT changing
1238                                  * the real page yet...
1239                                  */
1240                                 if (tempPage == (Page) NULL)
1241                                 {
1242                                         Size            pageSize;
1243
1244                                         pageSize = PageGetPageSize(page);
1245                                         tempPage = (Page) palloc(pageSize);
1246                                         memcpy(tempPage, page, pageSize);
1247                                 }
1248
1249                                 /* mark it unused on the temp page */
1250                                 lpp = PageGetItemId(tempPage, offnum);
1251                                 lpp->lp_flags &= ~LP_USED;
1252
1253                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1254                                 tups_vacuumed += 1;
1255                         }
1256                         else
1257                         {
1258                                 num_tuples += 1;
1259                                 notup = false;
1260                                 if (tuple.t_len < min_tlen)
1261                                         min_tlen = tuple.t_len;
1262                                 if (tuple.t_len > max_tlen)
1263                                         max_tlen = tuple.t_len;
1264                         }
1265                 }                                               /* scan along page */
1266
1267                 if (tempPage != (Page) NULL)
1268                 {
1269                         /* Some tuples are removable; figure free space after removal */
1270                         PageRepairFragmentation(tempPage, NULL);
1271                         vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
1272                         pfree(tempPage);
1273                         do_reap = true;
1274                 }
1275                 else
1276                 {
1277                         /* Just use current available space */
1278                         vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
1279                         /* Need to reap the page if it has unused (~LP_USED) line pointers */
1280                         do_reap = (vacpage->offsets_free > 0);
1281                 }
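                /*
                 * Either way, vacpage->free is pd_upper - pd_lower, the hole
                 * between the line pointer array and the tuple data.  After
                 * PageRepairFragmentation all of a page's free space is in that
                 * hole; otherwise it is just the currently contiguous free space.
                 */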
1282
1283                 free_size += vacpage->free;
1284
1285                 /*
1286                  * Add the page to fraged_pages if it has a useful amount of free
1287                  * space.  "Useful" means enough for a minimal-sized tuple. But we
1288                  * don't know that accurately near the start of the relation, so
1289                  * add pages unconditionally if they have >= BLCKSZ/10 free space.
1290                  */
1291                 do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10);
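                /* (with the default 8K BLCKSZ, that fallback threshold is 819 bytes) */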
1292
1293                 if (do_reap || do_frag)
1294                 {
1295                         vacpagecopy = copy_vac_page(vacpage);
1296                         if (do_reap)
1297                                 vpage_insert(vacuum_pages, vacpagecopy);
1298                         if (do_frag)
1299                                 vpage_insert(fraged_pages, vacpagecopy);
1300                 }
1301
1302                 if (notup)
1303                         empty_end_pages++;
1304                 else
1305                         empty_end_pages = 0;
1306
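                /*
                 * WriteBuffer marks the buffer dirty and drops our pin; the actual
                 * disk write happens later.  ReleaseBuffer just drops the pin.
                 */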
1307                 if (pgchanged)
1308                 {
1309                         WriteBuffer(buf);
1310                         changed_pages++;
1311                 }
1312                 else
1313                         ReleaseBuffer(buf);
1314         }
1315
1316         pfree(vacpage);
1317
1318         /* save stats in the rel list for use later */
1319         vacrelstats->rel_tuples = num_tuples;
1320         vacrelstats->rel_pages = nblocks;
1321         if (num_tuples == 0)
1322                 min_tlen = max_tlen = 0;
1323         vacrelstats->min_tlen = min_tlen;
1324         vacrelstats->max_tlen = max_tlen;
1325
1326         vacuum_pages->empty_end_pages = empty_end_pages;
1327         fraged_pages->empty_end_pages = empty_end_pages;
1328
1329         /*
1330          * Clear the fraged_pages list if we found we couldn't shrink. Else,
1331          * remove any "empty" end-pages from the list, and compute usable free
1332          * space = free space in remaining pages.
1333          */
1334         if (do_shrinking)
1335         {
1336                 Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages);
1337                 fraged_pages->num_pages -= empty_end_pages;
1338                 usable_free_size = 0;
1339                 for (i = 0; i < fraged_pages->num_pages; i++)
1340                         usable_free_size += fraged_pages->pagedesc[i]->free;
1341         }
1342         else
1343         {
1344                 fraged_pages->num_pages = 0;
1345                 usable_free_size = 0;
1346         }
1347
1348         /* don't bother to save vtlinks if we will not call repair_frag */
1349         if (fraged_pages->num_pages > 0 && num_vtlinks > 0)
1350         {
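                /*
                 * Sort by new_tid, since repair_frag locates a tuple's parent by
                 * binary-searching on the child's TID.
                 */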
1351                 qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
1352                           vac_cmp_vtlinks);
1353                 vacrelstats->vtlinks = vtlinks;
1354                 vacrelstats->num_vtlinks = num_vtlinks;
1355         }
1356         else
1357         {
1358                 vacrelstats->vtlinks = NULL;
1359                 vacrelstats->num_vtlinks = 0;
1360                 pfree(vtlinks);
1361         }
1362
1363         elog(elevel, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; "
1364                  "Tup %.0f: Vac %.0f, Keep/VTL %.0f/%u, UnUsed %.0f, MinLen %lu, "
1365                  "MaxLen %lu; Re-using: Free/Avail. Space %.0f/%.0f; "
1366                  "EndEmpty/Avail. Pages %u/%u.\n\t%s",
1367                  nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
1368                  new_pages, num_tuples, tups_vacuumed,
1369                  nkeep, vacrelstats->num_vtlinks,
1370                  nunused, (unsigned long) min_tlen, (unsigned long) max_tlen,
1371                  free_size, usable_free_size,
1372                  empty_end_pages, fraged_pages->num_pages,
1373                  vac_show_rusage(&ru0));
1374 }
1375
1376
1377 /*
1378  *      repair_frag() -- try to repair relation's fragmentation
1379  *
1380  *              This routine marks dead tuples as unused and tries to re-use dead
1381  *              space by moving tuples (and inserting index entries if needed).
1382  *              It constructs Nvacpagelist, a list of freed pages (sources of moved
1383  *              tuples), and cleans their index entries after committing the current
1384  *              transaction (in hack-manner - without releasing locks or freeing
1385  *              memory!).  It truncates the relation if some end-blocks become empty.
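 *
 *              A simplified sketch of the control flow (not exact; chain moves
 *              and failure cases are omitted):
 *
 *                      for (blkno = last nonempty block; blkno > last_move_dest_block; blkno--)
 *                              for each used tuple on the page:
 *                                      pick a lower page from fraged_pages with enough_space();
 *                                      copy the tuple there; mark the source MOVED_OFF and
 *                                              the copy MOVED_IN; insert index entries for the copy;
 *                      commit, reap the emptied pages, and truncate the empty tail.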
1386  */
1387 static void
1388 repair_frag(VRelStats *vacrelstats, Relation onerel,
1389                         VacPageList vacuum_pages, VacPageList fraged_pages,
1390                         int nindexes, Relation *Irel)
1391 {
1392         TransactionId myXID;
1393         CommandId       myCID;
1394         Buffer          buf,
1395                                 cur_buffer;
1396         BlockNumber nblocks,
1397                                 blkno;
1398         BlockNumber last_move_dest_block = 0,
1399                                 last_vacuum_block;
1400         Page            page,
1401                                 ToPage = NULL;
1402         OffsetNumber offnum,
1403                                 maxoff,
1404                                 newoff,
1405                                 max_offset;
1406         ItemId          itemid,
1407                                 newitemid;
1408         HeapTupleData tuple,
1409                                 newtup;
1410         TupleDesc       tupdesc;
1411         ResultRelInfo *resultRelInfo;
1412         EState     *estate;
1413         TupleTable      tupleTable;
1414         TupleTableSlot *slot;
1415         VacPageListData Nvacpagelist;
1416         VacPage         cur_page = NULL,
1417                                 last_vacuum_page,
1418                                 vacpage,
1419                            *curpage;
1420         int                     cur_item = 0;
1421         int                     i;
1422         Size            tuple_len;
1423         int                     num_moved,
1424                                 num_fraged_pages,
1425                                 vacuumed_pages;
1426         int                     checked_moved,
1427                                 num_tuples,
1428                                 keep_tuples = 0;
1429         bool            isempty,
1430                                 dowrite,
1431                                 chain_tuple_moved;
1432         VacRUsage       ru0;
1433
1434         vac_init_rusage(&ru0);
1435
1436         myXID = GetCurrentTransactionId();
1437         myCID = GetCurrentCommandId();
1438
1439         tupdesc = RelationGetDescr(onerel);
1440
1441         /*
1442          * We need a ResultRelInfo and an EState so we can use the regular
1443          * executor's index-entry-making machinery.
1444          */
1445         estate = CreateExecutorState();
1446
1447         resultRelInfo = makeNode(ResultRelInfo);
1448         resultRelInfo->ri_RangeTableIndex = 1;          /* dummy */
1449         resultRelInfo->ri_RelationDesc = onerel;
1450         resultRelInfo->ri_TrigDesc = NULL;      /* we don't fire triggers */
1451
1452         ExecOpenIndices(resultRelInfo);
1453
1454         estate->es_result_relations = resultRelInfo;
1455         estate->es_num_result_relations = 1;
1456         estate->es_result_relation_info = resultRelInfo;
1457
1458         /* Set up a dummy tuple table too */
1459         tupleTable = ExecCreateTupleTable(1);
1460         slot = ExecAllocTableSlot(tupleTable);
1461         ExecSetSlotDescriptor(slot, tupdesc, false);
1462
1463         Nvacpagelist.num_pages = 0;
1464         num_fraged_pages = fraged_pages->num_pages;
1465         Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages);
1466         vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
1467         if (vacuumed_pages > 0)
1468         {
1469                 /* get last reaped page from vacuum_pages */
1470                 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1471                 last_vacuum_block = last_vacuum_page->blkno;
1472         }
1473         else
1474         {
1475                 last_vacuum_page = NULL;
1476                 last_vacuum_block = InvalidBlockNumber;
1477         }
1478         cur_buffer = InvalidBuffer;
1479         num_moved = 0;
1480
1481         vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
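        /* sized to hold an offset for every possible line pointer on a page */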
1482         vacpage->offsets_used = vacpage->offsets_free = 0;
1483
1484         /*
1485          * Scan pages backwards from the last nonempty page, trying to move
1486          * tuples down to lower pages.  Quit when we reach a page that we have
1487          * moved any tuples onto, or the first page if we haven't moved
1488          * anything, or when we find a page we cannot completely empty (this
1489          * last condition is handled by "break" statements within the loop).
1490          *
1491          * NB: this code depends on the vacuum_pages and fraged_pages lists being
1492          * in order by blkno.
1493          */
1494         nblocks = vacrelstats->rel_pages;
1495         for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1496                  blkno > last_move_dest_block;
1497                  blkno--)
1498         {
1499                 CHECK_FOR_INTERRUPTS();
1500
1501                 /*
1502                  * Forget fraged_pages pages at or after this one; they're no
1503                  * longer useful as move targets, since we only want to move down.
1504                  * Note that since we stop the outer loop at last_move_dest_block,
1505                  * pages removed here cannot have had anything moved onto them
1506                  * already.
1507                  *
1508                  * Also note that we don't change the stored fraged_pages list, only
1509                  * our local variable num_fraged_pages; so the forgotten pages are
1510                  * still available to be loaded into the free space map later.
1511                  */
1512                 while (num_fraged_pages > 0 &&
1513                         fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno)
1514                 {
1515                         Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0);
1516                         --num_fraged_pages;
1517                 }
1518
1519                 /*
1520                  * Process this page of relation.
1521                  */
1522                 buf = ReadBuffer(onerel, blkno);
1523                 page = BufferGetPage(buf);
1524
1525                 vacpage->offsets_free = 0;
1526
1527                 isempty = PageIsEmpty(page);
1528
1529                 dowrite = false;
1530
1531                 /* Is the page in the vacuum_pages list? */
1532                 if (blkno == last_vacuum_block)
1533                 {
1534                         if (last_vacuum_page->offsets_free > 0)
1535                         {
1536                                 /* there are dead tuples on this page - clean them */
1537                                 Assert(!isempty);
1538                                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1539                                 vacuum_page(onerel, buf, last_vacuum_page);
1540                                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1541                                 dowrite = true;
1542                         }
1543                         else
1544                                 Assert(isempty);
1545                         --vacuumed_pages;
1546                         if (vacuumed_pages > 0)
1547                         {
1548                                 /* get prev reaped page from vacuum_pages */
1549                                 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1550                                 last_vacuum_block = last_vacuum_page->blkno;
1551                         }
1552                         else
1553                         {
1554                                 last_vacuum_page = NULL;
1555                                 last_vacuum_block = InvalidBlockNumber;
1556                         }
1557                         if (isempty)
1558                         {
1559                                 ReleaseBuffer(buf);
1560                                 continue;
1561                         }
1562                 }
1563                 else
1564                         Assert(!isempty);
1565
1566                 chain_tuple_moved = false;              /* no chain tuple has been
1567                                                                                  * moved off this page yet */
1568                 vacpage->blkno = blkno;
1569                 maxoff = PageGetMaxOffsetNumber(page);
1570                 for (offnum = FirstOffsetNumber;
1571                          offnum <= maxoff;
1572                          offnum = OffsetNumberNext(offnum))
1573                 {
1574                         itemid = PageGetItemId(page, offnum);
1575
1576                         if (!ItemIdIsUsed(itemid))
1577                                 continue;
1578
1579                         tuple.t_datamcxt = NULL;
1580                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1581                         tuple_len = tuple.t_len = ItemIdGetLength(itemid);
1582                         ItemPointerSet(&(tuple.t_self), blkno, offnum);
1583
1584                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1585                         {
1586                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1587                                         elog(ERROR, "HEAP_MOVED_IN was not expected");
1588
1589                                 /*
1590                                  * If this (chain) tuple was already moved by me, I have
1591                                  * to check whether it is recorded in vacpage - i.e. was
1592                                  * it moved while cleaning this page or some previous one.
1593                                  */
1594                                 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1595                                 {
1596                                         if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
1597                                                 elog(ERROR, "Invalid XVAC in tuple header");
1598                                         if (keep_tuples == 0)
1599                                                 continue;
1600                                         if (chain_tuple_moved)
1601                                         {
1602                                                 /* some chain tuples were moved while cleaning this page */
1603                                                 Assert(vacpage->offsets_free > 0);
1604                                                 for (i = 0; i < vacpage->offsets_free; i++)
1605                                                 {
1606                                                         if (vacpage->offsets[i] == offnum)
1607                                                                 break;
1608                                                 }
1609                                                 if (i >= vacpage->offsets_free) /* not found */
1610                                                 {
1611                                                         vacpage->offsets[vacpage->offsets_free++] = offnum;
1612                                                         keep_tuples--;
1613                                                 }
1614                                         }
1615                                         else
1616                                         {
1617                                                 vacpage->offsets[vacpage->offsets_free++] = offnum;
1618                                                 keep_tuples--;
1619                                         }
1620                                         continue;
1621                                 }
1622                                 elog(ERROR, "HEAP_MOVED_OFF was expected");
1623                         }
1624
1625                         /*
1626                          * If this tuple is in a chain of tuples created by updates
1627                          * of "recent" transactions then we have to move the whole
1628                          * chain of tuples to other places.
1629                          *
1630                          * NOTE: this test is not 100% accurate: it is possible for a
1631                          * tuple to be an updated one with recent xmin, and yet not
1632                          * have a corresponding tuple in the vtlinks list.      Presumably
1633                          * there was once a parent tuple with xmax matching the xmin,
1634                          * but it's possible that that tuple has been removed --- for
1635                          * example, if it had xmin = xmax then
1636                          * HeapTupleSatisfiesVacuum would deem it removable as soon as
1637                          * the xmin xact completes.
1638                          *
1639                          * To be on the safe side, we abandon the repair_frag process if
1640                          * we cannot find the parent tuple in vtlinks.  This may be
1641                          * overly conservative; AFAICS it would be safe to move the
1642                          * chain.
1643                          */
1644                         if (((tuple.t_data->t_infomask & HEAP_UPDATED) &&
1645                          !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data),
1646                                                                         OldestXmin)) ||
1647                                 (!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID |
1648                                                                                            HEAP_MARKED_FOR_UPDATE)) &&
1649                                  !(ItemPointerEquals(&(tuple.t_self),
1650                                                                          &(tuple.t_data->t_ctid)))))
1651                         {
1652                                 Buffer          Cbuf = buf;
1653                                 bool            freeCbuf = false;
1654                                 bool            chain_move_failed = false;
1655                                 Page            Cpage;
1656                                 ItemId          Citemid;
1657                                 ItemPointerData Ctid;
1658                                 HeapTupleData tp = tuple;
1659                                 Size            tlen = tuple_len;
1660                                 VTupleMove      vtmove;
1661                                 int                     num_vtmove;
1662                                 int                     free_vtmove;
1663                                 VacPage         to_vacpage = NULL;
1664                                 int                     to_item = 0;
1665                                 int                     ti;
1666
1667                                 if (cur_buffer != InvalidBuffer)
1668                                 {
1669                                         WriteBuffer(cur_buffer);
1670                                         cur_buffer = InvalidBuffer;
1671                                 }
1672
1673                                 /* Quick exit if we have no vtlinks to search in */
1674                                 if (vacrelstats->vtlinks == NULL)
1675                                 {
1676                                         elog(DEBUG2, "Parent item in update-chain not found - can't continue repair_frag");
1677                                         break;          /* out of walk-along-page loop */
1678                                 }
1679
1680                                 vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData));
1681                                 num_vtmove = 0;
1682                                 free_vtmove = 100;
1683
1684                                 /*
1685                                  * If this tuple is at the beginning or middle of the
1686                                  * chain then we have to walk to the end of the chain.
1687                                  */
1688                                 while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
1689                                                                                           HEAP_MARKED_FOR_UPDATE)) &&
1690                                            !(ItemPointerEquals(&(tp.t_self),
1691                                                                                    &(tp.t_data->t_ctid))))
1692                                 {
1693                                         Ctid = tp.t_data->t_ctid;
1694                                         if (freeCbuf)
1695                                                 ReleaseBuffer(Cbuf);
1696                                         freeCbuf = true;
1697                                         Cbuf = ReadBuffer(onerel,
1698                                                                           ItemPointerGetBlockNumber(&Ctid));
1699                                         Cpage = BufferGetPage(Cbuf);
1700                                         Citemid = PageGetItemId(Cpage,
1701                                                                           ItemPointerGetOffsetNumber(&Ctid));
1702                                         if (!ItemIdIsUsed(Citemid))
1703                                         {
1704                                                 /*
1705                                                  * This means that in the middle of the chain there
1706                                                  * was a tuple updated by an xaction older than
1707                                                  * OldestXmin, and I have already deleted that
1708                                                  * tuple. Actually, the upper part of the chain
1709                                                  * should be removed; that ought to be handled in
1710                                                  * scan_heap(), but it's not implemented at the
1711                                                  * moment, so we just stop shrinking here.
1712                                                  */
1713                                                 elog(DEBUG2, "Child itemid in update-chain marked as unused - can't continue repair_frag");
1714                                                 chain_move_failed = true;
1715                                                 break;  /* out of loop to move to chain end */
1716                                         }
1717                                         tp.t_datamcxt = NULL;
1718                                         tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1719                                         tp.t_self = Ctid;
1720                                         tlen = tp.t_len = ItemIdGetLength(Citemid);
1721                                 }
1722                                 if (chain_move_failed)
1723                                 {
1724                                         if (freeCbuf)
1725                                                 ReleaseBuffer(Cbuf);
1726                                         pfree(vtmove);
1727                                         break;          /* out of walk-along-page loop */
1728                                 }
1729
1730                                 /*
1731                                  * Check if all items in chain can be moved
1732                                  */
1733                                 for (;;)
1734                                 {
1735                                         Buffer          Pbuf;
1736                                         Page            Ppage;
1737                                         ItemId          Pitemid;
1738                                         HeapTupleData Ptp;
1739                                         VTupleLinkData vtld,
1740                                                            *vtlp;
1741
1742                                         if (to_vacpage == NULL ||
1743                                                 !enough_space(to_vacpage, tlen))
1744                                         {
1745                                                 for (i = 0; i < num_fraged_pages; i++)
1746                                                 {
1747                                                         if (enough_space(fraged_pages->pagedesc[i], tlen))
1748                                                                 break;
1749                                                 }
1750
1751                                                 if (i == num_fraged_pages)
1752                                                 {
1753                                                         /* can't move item anywhere */
1754                                                         chain_move_failed = true;
1755                                                         break;          /* out of check-all-items loop */
1756                                                 }
1757                                                 to_item = i;
1758                                                 to_vacpage = fraged_pages->pagedesc[to_item];
1759                                         }
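                                        /*
                                         * Reserve space on the target page: the tuple body is
                                         * MAXALIGN'd, and once all of the page's recyclable
                                         * (free) line pointers are spoken for we must also
                                         * charge for a new ItemIdData slot.
                                         */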
1760                                         to_vacpage->free -= MAXALIGN(tlen);
1761                                         if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
1762                                                 to_vacpage->free -= sizeof(ItemIdData);
1763                                         (to_vacpage->offsets_used)++;
1764                                         if (free_vtmove == 0)
1765                                         {
1766                                                 free_vtmove = 1000;
1767                                                 vtmove = (VTupleMove)
1768                                                         repalloc(vtmove,
1769                                                                          (free_vtmove + num_vtmove) *
1770                                                                          sizeof(VTupleMoveData));
1771                                         }
1772                                         vtmove[num_vtmove].tid = tp.t_self;
1773                                         vtmove[num_vtmove].vacpage = to_vacpage;
1774                                         if (to_vacpage->offsets_used == 1)
1775                                                 vtmove[num_vtmove].cleanVpd = true;
1776                                         else
1777                                                 vtmove[num_vtmove].cleanVpd = false;
1778                                         free_vtmove--;
1779                                         num_vtmove++;
1780
1781                                         /* At beginning of chain? */
1782                                         if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
1783                                                 TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data),
1784                                                                                           OldestXmin))
1785                                                 break;
1786
1787                                         /* No, move to tuple with prior row version */
1788                                         vtld.new_tid = tp.t_self;
1789                                         vtlp = (VTupleLink)
1790                                                 vac_bsearch((void *) &vtld,
1791                                                                         (void *) (vacrelstats->vtlinks),
1792                                                                         vacrelstats->num_vtlinks,
1793                                                                         sizeof(VTupleLinkData),
1794                                                                         vac_cmp_vtlinks);
1795                                         if (vtlp == NULL)
1796                                         {
1797                                                 /* see discussion above */
1798                                                 elog(DEBUG2, "Parent item in update-chain not found - can't continue repair_frag");
1799                                                 chain_move_failed = true;
1800                                                 break;  /* out of check-all-items loop */
1801                                         }
1802                                         tp.t_self = vtlp->this_tid;
1803                                         Pbuf = ReadBuffer(onerel,
1804                                                                 ItemPointerGetBlockNumber(&(tp.t_self)));
1805                                         Ppage = BufferGetPage(Pbuf);
1806                                         Pitemid = PageGetItemId(Ppage,
1807                                                            ItemPointerGetOffsetNumber(&(tp.t_self)));
1808                                         /* this can't happen since we saw tuple earlier: */
1809                                         if (!ItemIdIsUsed(Pitemid))
1810                                                 elog(ERROR, "Parent itemid marked as unused");
1811                                         Ptp.t_datamcxt = NULL;
1812                                         Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
1813
1814                                         /* ctid should not have changed since we saved it */
1815                                         Assert(ItemPointerEquals(&(vtld.new_tid),
1816                                                                                          &(Ptp.t_data->t_ctid)));
1817
1818                                         /*
1819                                          * Read above about the cases when
1820                                          * !ItemIdIsUsed(Citemid) (the child item has been
1821                                          * removed)... Since at the moment we don't remove the
1822                                          * no-longer-needed part of an update chain, it's
1823                                          * possible to find a too-old parent row here. As in
1824                                          * the case noted above, we just stop shrinking here.
1825                                          * I could try to find the real parent row, but I
1826                                          * don't want to, because the real solution will be
1827                                          * implemented later anyway, and we are too close to
1828                                          * the 6.5 release. - vadim 06/11/99
1829                                          */
1830                                         if (!(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data),
1831                                                                          HeapTupleHeaderGetXmin(tp.t_data))))
1832                                         {
1833                                                 ReleaseBuffer(Pbuf);
1834                                                 elog(DEBUG2, "Too old parent tuple found - can't continue repair_frag");
1835                                                 chain_move_failed = true;
1836                                                 break;  /* out of check-all-items loop */
1837                                         }
1838                                         tp.t_datamcxt = Ptp.t_datamcxt;
1839                                         tp.t_data = Ptp.t_data;
1840                                         tlen = tp.t_len = ItemIdGetLength(Pitemid);
1841                                         if (freeCbuf)
1842                                                 ReleaseBuffer(Cbuf);
1843                                         Cbuf = Pbuf;
1844                                         freeCbuf = true;
1845                                 }                               /* end of check-all-items loop */
1846
1847                                 if (freeCbuf)
1848                                         ReleaseBuffer(Cbuf);
1849                                 freeCbuf = false;
1850
1851                                 if (chain_move_failed)
1852                                 {
1853                                         /*
1854                                          * Undo changes to offsets_used state.  We don't
1855                                          * bother cleaning up the amount-free state, since
1856                                          * we're not going to do any further tuple motion.
1857                                          */
1858                                         for (i = 0; i < num_vtmove; i++)
1859                                         {
1860                                                 Assert(vtmove[i].vacpage->offsets_used > 0);
1861                                                 (vtmove[i].vacpage->offsets_used)--;
1862                                         }
1863                                         pfree(vtmove);
1864                                         break;          /* out of walk-along-page loop */
1865                                 }
1866
1867                                 /*
1868                                  * Okay, move the whole tuple chain
1869                                  */
1870                                 ItemPointerSetInvalid(&Ctid);
1871                                 for (ti = 0; ti < num_vtmove; ti++)
1872                                 {
1873                                         VacPage         destvacpage = vtmove[ti].vacpage;
1874
1875                                         /* Get page to move from */
1876                                         tuple.t_self = vtmove[ti].tid;
1877                                         Cbuf = ReadBuffer(onerel,
1878                                                          ItemPointerGetBlockNumber(&(tuple.t_self)));
1879
1880                                         /* Get page to move to */
1881                                         cur_buffer = ReadBuffer(onerel, destvacpage->blkno);
1882
1883                                         LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1884                                         if (cur_buffer != Cbuf)
1885                                                 LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);
1886
1887                                         ToPage = BufferGetPage(cur_buffer);
1888                                         Cpage = BufferGetPage(Cbuf);
1889
1890                                         Citemid = PageGetItemId(Cpage,
1891                                                         ItemPointerGetOffsetNumber(&(tuple.t_self)));
1892                                         tuple.t_datamcxt = NULL;
1893                                         tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1894                                         tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1895
1896                                         /*
1897                                          * make a copy of the source tuple, and then mark the
1898                                          * source tuple MOVED_OFF.
1899                                          */
1900                                         heap_copytuple_with_tuple(&tuple, &newtup);
1901
1902                                         /*
1903                                          * register invalidation of source tuple in catcaches.
1904                                          */
1905                                         CacheInvalidateHeapTuple(onerel, &tuple);
1906
1907                                         /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
1908                                         START_CRIT_SECTION();
1909
1910                                         tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
1911                                                                                                   HEAP_XMIN_INVALID |
1912                                                                                                   HEAP_MOVED_IN);
1913                                         tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1914                                         HeapTupleHeaderSetXvac(tuple.t_data, myXID);
1915
1916                                         /*
1917                                          * If this page was not used before - clean it.
1918                                          *
1919                                          * NOTE: a nasty bug used to lurk here.  It is possible
1920                                          * for the source and destination pages to be the same
1921                                          * (since this tuple-chain member can be on a page
1922                                          * lower than the one we're currently processing in
1923                                          * the outer loop).  If that's true, then after
1924                                          * vacuum_page() the source tuple will have been
1925                                          * moved, and tuple.t_data will be pointing at
1926                                          * garbage.  Therefore we must do everything that uses
1927                                          * tuple.t_data BEFORE this step!!
1928                                          *
1929                                          * This path is different from the other callers of
1930                                          * vacuum_page, because we have already incremented
1931                                          * the vacpage's offsets_used field to account for the
1932                                          * tuple(s) we expect to move onto the page. Therefore
1933                                          * vacuum_page's check for offsets_used == 0 is wrong.
1934                                          * But since that's a good debugging check for all
1935                                          * other callers, we work around it here rather than
1936                                          * remove it.
1937                                          */
1938                                         if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1939                                         {
1940                                                 int                     sv_offsets_used = destvacpage->offsets_used;
1941
1942                                                 destvacpage->offsets_used = 0;
1943                                                 vacuum_page(onerel, cur_buffer, destvacpage);
1944                                                 destvacpage->offsets_used = sv_offsets_used;
1945                                         }
1946
1947                                         /*
1948                                          * Update the state of the copied tuple, and store it
1949                                          * on the destination page.
1950                                          */
1951                                         newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
1952                                                                                                    HEAP_XMIN_INVALID |
1953                                                                                                    HEAP_MOVED_OFF);
1954                                         newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1955                                         HeapTupleHeaderSetXvac(newtup.t_data, myXID);
1956                                         newoff = PageAddItem(ToPage,
1957                                                                                  (Item) newtup.t_data,
1958                                                                                  tuple_len,
1959                                                                                  InvalidOffsetNumber,
1960                                                                                  LP_USED);
1961                                         if (newoff == InvalidOffsetNumber)
1962                                         {
1963                                                 elog(PANIC, "moving chain: failed to add item with len = %lu to page %u",
1964                                                   (unsigned long) tuple_len, destvacpage->blkno);
1965                                         }
1966                                         newitemid = PageGetItemId(ToPage, newoff);
1967                                         pfree(newtup.t_data);
1968                                         newtup.t_datamcxt = NULL;
1969                                         newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1970                                         ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
1971
1972                                         /* XLOG stuff */
1973                                         if (!onerel->rd_istemp)
1974                                         {
1975                                                 XLogRecPtr      recptr =
1976                                                 log_heap_move(onerel, Cbuf, tuple.t_self,
1977                                                                           cur_buffer, &newtup);
1978
1979                                                 if (Cbuf != cur_buffer)
1980                                                 {
1981                                                         PageSetLSN(Cpage, recptr);
1982                                                         PageSetSUI(Cpage, ThisStartUpID);
1983                                                 }
1984                                                 PageSetLSN(ToPage, recptr);
1985                                                 PageSetSUI(ToPage, ThisStartUpID);
1986                                         }
1987                                         else
1988                                         {
1989                                                 /*
1990                                                  * No XLOG record, but still need to flag that XID
1991                                                  * exists on disk
1992                                                  */
1993                                                 MyXactMadeTempRelUpdate = true;
1994                                         }
1995
1996                                         END_CRIT_SECTION();
1997
1998                                         if (destvacpage->blkno > last_move_dest_block)
1999                                                 last_move_dest_block = destvacpage->blkno;
2000
2001                                         /*
2002                                          * Set new tuple's t_ctid pointing to itself for last
2003                                          * tuple in chain, and to next tuple in chain
2004                                          * otherwise.
2005                                          */
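                                        /*
                                         * (vtmove[] was filled from the chain's end back toward
                                         * its start, so ti == 0 is the chain end; for later
                                         * entries Ctid holds the new TID of the successor
                                         * version.)
                                         */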
2006                                         if (!ItemPointerIsValid(&Ctid))
2007                                                 newtup.t_data->t_ctid = newtup.t_self;
2008                                         else
2009                                                 newtup.t_data->t_ctid = Ctid;
2010                                         Ctid = newtup.t_self;
2011
2012                                         num_moved++;
2013
2014                                         /*
2015                                          * Remember that we moved tuple from the current page
2016                                          * (corresponding index tuple will be cleaned).
2017                                          */
2018                                         if (Cbuf == buf)
2019                                                 vacpage->offsets[vacpage->offsets_free++] =
2020                                                         ItemPointerGetOffsetNumber(&(tuple.t_self));
2021                                         else
2022                                                 keep_tuples++;
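                                        /*
                                         * keep_tuples counts tuples we marked MOVED_OFF on pages
                                         * other than the one being cleaned; the scan accounts
                                         * for them when it reaches those pages.
                                         */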
2023
2024                                         LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
2025                                         if (cur_buffer != Cbuf)
2026                                                 LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);
2027
2028                                         /* Create index entries for the moved tuple */
2029                                         if (resultRelInfo->ri_NumIndices > 0)
2030                                         {
2031                                                 ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
2032                                                 ExecInsertIndexTuples(slot, &(newtup.t_self),
2033                                                                                           estate, true);
2034                                         }
2035
2036                                         WriteBuffer(cur_buffer);
2037                                         WriteBuffer(Cbuf);
2038                                 }                               /* end of move-the-tuple-chain loop */
2039
2040                                 cur_buffer = InvalidBuffer;
2041                                 pfree(vtmove);
2042                                 chain_tuple_moved = true;
2043
2044                                 /* advance to next tuple in walk-along-page loop */
2045                                 continue;
2046                         }                                       /* end of is-tuple-in-chain test */
2047
2048                         /* try to find new page for this tuple */
2049                         if (cur_buffer == InvalidBuffer ||
2050                                 !enough_space(cur_page, tuple_len))
2051                         {
2052                                 if (cur_buffer != InvalidBuffer)
2053                                 {
2054                                         WriteBuffer(cur_buffer);
2055                                         cur_buffer = InvalidBuffer;
2056                                 }
2057                                 for (i = 0; i < num_fraged_pages; i++)
2058                                 {
2059                                         if (enough_space(fraged_pages->pagedesc[i], tuple_len))
2060                                                 break;
2061                                 }
2062                                 if (i == num_fraged_pages)
2063                                         break;          /* can't move item anywhere */
2064                                 cur_item = i;
2065                                 cur_page = fraged_pages->pagedesc[cur_item];
2066                                 cur_buffer = ReadBuffer(onerel, cur_page->blkno);
2067                                 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
2068                                 ToPage = BufferGetPage(cur_buffer);
2069                                 /* if this page was not used before - clean it */
2070                                 if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
2071                                         vacuum_page(onerel, cur_buffer, cur_page);
2072                         }
2073                         else
2074                                 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
2075
2076                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2077
2078                         /* copy tuple */
2079                         heap_copytuple_with_tuple(&tuple, &newtup);
2080
2081                         /*
2082                          * register invalidation of source tuple in catcaches.
2083                          *
2084                          * (Note: we do not need to register the copied tuple, because we
2085                          * are not changing the tuple contents and so there cannot be
2086                          * any need to flush negative catcache entries.)
2087                          */
2088                         CacheInvalidateHeapTuple(onerel, &tuple);
2089
2090                         /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
2091                         START_CRIT_SECTION();
2092
2093                         /*
2094                          * Mark new tuple as MOVED_IN by me.
2095                          */
2096                         newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2097                                                                                    HEAP_XMIN_INVALID |
2098                                                                                    HEAP_MOVED_OFF);
2099                         newtup.t_data->t_infomask |= HEAP_MOVED_IN;
2100                         HeapTupleHeaderSetXvac(newtup.t_data, myXID);
2101
2102                         /* add tuple to the page */
2103                         newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
2104                                                                  InvalidOffsetNumber, LP_USED);
2105                         if (newoff == InvalidOffsetNumber)
2106                         {
2107                                 elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)",
2108                                          (unsigned long) tuple_len,
2109                                          cur_page->blkno, (unsigned long) cur_page->free,
2110                                          cur_page->offsets_used, cur_page->offsets_free);
2111                         }
2112                         newitemid = PageGetItemId(ToPage, newoff);
2113                         pfree(newtup.t_data);
2114                         newtup.t_datamcxt = NULL;
2115                         newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
2116                         ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
2117                         newtup.t_self = newtup.t_data->t_ctid;
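                        /* a tuple moved on its own is its own chain end: t_ctid = itself */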
2118
2119                         /*
2120                          * Mark old tuple as MOVED_OFF by me.
2121                          */
2122                         tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED |
2123                                                                                   HEAP_XMIN_INVALID |
2124                                                                                   HEAP_MOVED_IN);
2125                         tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
2126                         HeapTupleHeaderSetXvac(tuple.t_data, myXID);
2127
2128                         /* XLOG stuff */
2129                         if (!onerel->rd_istemp)
2130                         {
2131                                 XLogRecPtr      recptr =
2132                                 log_heap_move(onerel, buf, tuple.t_self,
2133                                                           cur_buffer, &newtup);
2134
2135                                 PageSetLSN(page, recptr);
2136                                 PageSetSUI(page, ThisStartUpID);
2137                                 PageSetLSN(ToPage, recptr);
2138                                 PageSetSUI(ToPage, ThisStartUpID);
2139                         }
2140                         else
2141                         {
2142                                 /*
2143                                  * No XLOG record, but still need to flag that XID exists
2144                                  * on disk
2145                                  */
2146                                 MyXactMadeTempRelUpdate = true;
2147                         }
2148
2149                         END_CRIT_SECTION();
2150
2151                         cur_page->offsets_used++;
2152                         num_moved++;
2153                         cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
2154                         if (cur_page->blkno > last_move_dest_block)
2155                                 last_move_dest_block = cur_page->blkno;
2156
2157                         vacpage->offsets[vacpage->offsets_free++] = offnum;
2158
2159                         LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
2160                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2161
2162                         /* insert index tuples if needed */
2163                         if (resultRelInfo->ri_NumIndices > 0)
2164                         {
2165                                 ExecStoreTuple(&newtup, slot, InvalidBuffer, false);
2166                                 ExecInsertIndexTuples(slot, &(newtup.t_self), estate, true);
2167                         }
2168                 }                                               /* walk along page */
2169
2170                 /*
2171                  * If we broke out of the walk-along-page loop early (i.e., we
2172                  * still have offnum <= maxoff), then we failed to move some
2173                  * tuple off this page.  No point in shrinking any more, so clean
2174                  * up and exit the per-page loop.
2175                  */
2176                 if (offnum < maxoff && keep_tuples > 0)
2177                 {
2178                         OffsetNumber off;
2179
2180                         /*
2181                          * Fix vacpage state for any unvisited tuples remaining on
2182                          * page
2183                          */
2184                         for (off = OffsetNumberNext(offnum);
2185                                  off <= maxoff;
2186                                  off = OffsetNumberNext(off))
2187                         {
2188                                 itemid = PageGetItemId(page, off);
2189                                 if (!ItemIdIsUsed(itemid))
2190                                         continue;
2191                                 tuple.t_datamcxt = NULL;
2192                                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2193                                 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
2194                                         continue;
2195                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
2196                                         elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
2197                                 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
2198                                 {
2199                                         if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2200                                                 elog(ERROR, "Invalid XVAC in tuple header (4)");
2201                                         /* some chain tuples were moved while cleaning this page */
2202                                         if (chain_tuple_moved)
2203                                         {
2204                                                 Assert(vacpage->offsets_free > 0);
2205                                                 for (i = 0; i < vacpage->offsets_free; i++)
2206                                                 {
2207                                                         if (vacpage->offsets[i] == off)
2208                                                                 break;
2209                                                 }
2210                                                 if (i >= vacpage->offsets_free) /* not found */
2211                                                 {
2212                                                         vacpage->offsets[vacpage->offsets_free++] = off;
2213                                                         Assert(keep_tuples > 0);
2214                                                         keep_tuples--;
2215                                                 }
2216                                         }
2217                                         else
2218                                         {
2219                                                 vacpage->offsets[vacpage->offsets_free++] = off;
2220                                                 Assert(keep_tuples > 0);
2221                                                 keep_tuples--;
2222                                         }
2223                                 }
2224                                 else
2225                                         elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
2226                         }
2227                 }
2228
2229                 if (vacpage->offsets_free > 0)  /* some tuples were moved */
2230                 {
2231                         if (chain_tuple_moved)          /* else - they are ordered */
2232                         {
2233                                 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
2234                                           sizeof(OffsetNumber), vac_cmp_offno);
2235                         }
2236                         vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
2237                         WriteBuffer(buf);
2238                 }
2239                 else if (dowrite)
2240                         WriteBuffer(buf);
2241                 else
2242                         ReleaseBuffer(buf);
2243
2244                 if (offnum <= maxoff)
2245                         break;                          /* had to quit early, see above note */
2246
2247         }                                                       /* walk along relation */
2248
2249         blkno++;                                        /* new number of blocks */
2250
2251         if (cur_buffer != InvalidBuffer)
2252         {
2253                 Assert(num_moved > 0);
2254                 WriteBuffer(cur_buffer);
2255         }
2256
2257         if (num_moved > 0)
2258         {
2259                 /*
2260                  * We have to commit our tuple moves before we truncate the
2261                  * relation.  Ideally we should do Commit/StartTransactionCommand
2262                  * here, relying on the session-level table lock to protect our
2263                  * exclusive access to the relation.  However, that would require
2264                  * a lot of extra code to close and re-open the relation, indexes,
2265                  * etc.  For now, a quick hack: record status of current
2266                  * transaction as committed, and continue.
2267                  */
2268                 RecordTransactionCommit();
2269         }
2270
2271         /*
2272          * We are not going to move any more tuples across pages, but we still
2273          * need to apply vacuum_page to compact free space in the remaining
2274          * pages in vacuum_pages list.  Note that some of these pages may also
2275          * be in the fraged_pages list, and may have had tuples moved onto
2276          * them; if so, we already did vacuum_page and needn't do it again.
2277          */
2278         for (i = 0, curpage = vacuum_pages->pagedesc;
2279                  i < vacuumed_pages;
2280                  i++, curpage++)
2281         {
2282                 CHECK_FOR_INTERRUPTS();
2283                 Assert((*curpage)->blkno < blkno);
2284                 if ((*curpage)->offsets_used == 0)
2285                 {
2286                         /* this page was not used as a move target, so we must clean it */
2287                         buf = ReadBuffer(onerel, (*curpage)->blkno);
2288                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2289                         page = BufferGetPage(buf);
2290                         if (!PageIsEmpty(page))
2291                                 vacuum_page(onerel, buf, *curpage);
2292                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2293                         WriteBuffer(buf);
2294                 }
2295         }
2296
2297         /*
2298          * Now scan all the pages that we moved tuples onto and update tuple
2299          * status bits.  This is not really necessary, but will save time for
2300          * future transactions examining these tuples.
2301          *
2302          * XXX NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from
2303          * pages that were move source pages but not move dest pages.  One
2304          * also wonders whether it wouldn't be better to skip this step and
2305          * let the tuple status updates happen someplace that's not holding an
2306          * exclusive lock on the relation.
2307          */
2308         checked_moved = 0;
2309         for (i = 0, curpage = fraged_pages->pagedesc;
2310                  i < num_fraged_pages;
2311                  i++, curpage++)
2312         {
2313                 CHECK_FOR_INTERRUPTS();
2314                 Assert((*curpage)->blkno < blkno);
2315                 if ((*curpage)->blkno > last_move_dest_block)
2316                         break;                          /* no need to scan any further */
2317                 if ((*curpage)->offsets_used == 0)
2318                         continue;                       /* this page was never used as a move dest */
2319                 buf = ReadBuffer(onerel, (*curpage)->blkno);
2320                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2321                 page = BufferGetPage(buf);
2322                 num_tuples = 0;
2323                 max_offset = PageGetMaxOffsetNumber(page);
2324                 for (newoff = FirstOffsetNumber;
2325                          newoff <= max_offset;
2326                          newoff = OffsetNumberNext(newoff))
2327                 {
2328                         itemid = PageGetItemId(page, newoff);
2329                         if (!ItemIdIsUsed(itemid))
2330                                 continue;
2331                         tuple.t_datamcxt = NULL;
2332                         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2333                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
2334                         {
2335                                 if (!(tuple.t_data->t_infomask & HEAP_MOVED))
2336                                         elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
2337                                 if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2338                                         elog(ERROR, "Invalid XVAC in tuple header (2)");
2339                                 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
2340                                 {
2341                                         tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
2342                                         tuple.t_data->t_infomask &= ~HEAP_MOVED;
2343                                         num_tuples++;
2344                                 }
2345                                 else
2346                                         tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
2347                         }
2348                 }
2349                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2350                 WriteBuffer(buf);
2351                 Assert((*curpage)->offsets_used == num_tuples);
2352                 checked_moved += num_tuples;
2353         }
2354         Assert(num_moved == checked_moved);
2355
2356         elog(elevel, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u.\n\t%s",
2357                  RelationGetRelationName(onerel),
2358                  nblocks, blkno, num_moved,
2359                  vac_show_rusage(&ru0));
2360
2361         /*
2362          * Reflect the motion of system tuples in the catalog caches here.
2363          */
2364         CommandCounterIncrement();
2365
2366         if (Nvacpagelist.num_pages > 0)
2367         {
2368                 /* vacuum indexes again if needed */
2369                 if (Irel != (Relation *) NULL)
2370                 {
2371                         VacPage    *vpleft,
2372                                            *vpright,
2373                                                 vpsave;
2374
2375                         /* re-sort Nvacpagelist.pagedesc */
2376                         for (vpleft = Nvacpagelist.pagedesc,
2377                         vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
2378                                  vpleft < vpright; vpleft++, vpright--)
2379                         {
2380                                 vpsave = *vpleft;
2381                                 *vpleft = *vpright;
2382                                 *vpright = vpsave;
2383                         }
2384                         Assert(keep_tuples >= 0);
2385                         for (i = 0; i < nindexes; i++)
2386                                 vacuum_index(&Nvacpagelist, Irel[i],
2387                                                          vacrelstats->rel_tuples, keep_tuples);
2388                 }
2389
2390                 /* clean moved tuples from the last page in Nvacpagelist */
2391                 if (vacpage->blkno == (blkno - 1) &&
2392                         vacpage->offsets_free > 0)
2393                 {
2394                         OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
2395                         int                     uncnt;
2396
2397                         buf = ReadBuffer(onerel, vacpage->blkno);
2398                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2399                         page = BufferGetPage(buf);
2400                         num_tuples = 0;
2401                         maxoff = PageGetMaxOffsetNumber(page);
2402                         for (offnum = FirstOffsetNumber;
2403                                  offnum <= maxoff;
2404                                  offnum = OffsetNumberNext(offnum))
2405                         {
2406                                 itemid = PageGetItemId(page, offnum);
2407                                 if (!ItemIdIsUsed(itemid))
2408                                         continue;
2409                                 tuple.t_datamcxt = NULL;
2410                                 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2411
2412                                 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
2413                                 {
2414                                         if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
2415                                         {
2416                                                 if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID)
2417                                                         elog(ERROR, "Invalid XVAC in tuple header (3)");
2418                                                 itemid->lp_flags &= ~LP_USED;
2419                                                 num_tuples++;
2420                                         }
2421                                         else
2422                                                 elog(ERROR, "HEAP_MOVED_OFF was expected (3)");
2423                                 }
2424
2425                         }
2426                         Assert(vacpage->offsets_free == num_tuples);
2427
2428                         START_CRIT_SECTION();
2429
2430                         uncnt = PageRepairFragmentation(page, unused);
2431
2432                         /* XLOG stuff */
2433                         if (!onerel->rd_istemp)
2434                         {
2435                                 XLogRecPtr      recptr;
2436
2437                                 recptr = log_heap_clean(onerel, buf, unused, uncnt);
2438                                 PageSetLSN(page, recptr);
2439                                 PageSetSUI(page, ThisStartUpID);
2440                         }
2441                         else
2442                         {
2443                                 /*
2444                                  * No XLOG record, but still need to flag that XID exists
2445                                  * on disk
2446                                  */
2447                                 MyXactMadeTempRelUpdate = true;
2448                         }
2449
2450                         END_CRIT_SECTION();
2451
2452                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2453                         WriteBuffer(buf);
2454                 }
2455
2456                 /* now free the new list of reaped pages */
2457                 curpage = Nvacpagelist.pagedesc;
2458                 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
2459                         pfree(*curpage);
2460                 pfree(Nvacpagelist.pagedesc);
2461         }
2462
2463         /*
2464          * Flush dirty pages out to disk.  We do this unconditionally, even if
2465          * we don't need to truncate, because we want to ensure that all
2466          * tuples have correct on-row commit status on disk (see bufmgr.c's
2467          * comments for FlushRelationBuffers()).
2468          */
2469         i = FlushRelationBuffers(onerel, blkno);
2470         if (i < 0)
2471                 elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
2472                          i);
2473
2474         /* truncate relation, if needed */
2475         if (blkno < nblocks)
2476         {
2477                 blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
2478                 onerel->rd_nblocks = blkno;             /* update relcache immediately */
2479                 onerel->rd_targblock = InvalidBlockNumber;
2480                 vacrelstats->rel_pages = blkno; /* set new number of blocks */
2481         }
2482
2483         /* clean up */
2484         pfree(vacpage);
2485         if (vacrelstats->vtlinks != NULL)
2486                 pfree(vacrelstats->vtlinks);
2487
2488         ExecDropTupleTable(tupleTable, true);
2489
2490         ExecCloseIndices(resultRelInfo);
2491
2492         FreeExecutorState(estate);
2493 }
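
/*
 * For orientation, a condensed sketch (a hypothetical helper, with names
 * assumed) of how the Xvac bits set above are later interpreted by the
 * visibility checks in tqual.c.  It assumes the commit status of the
 * vacuum's XID has already been determined:
 */
#ifdef NOT_USED
static bool
example_moved_tuple_is_valid(HeapTupleHeader htup, bool xvac_committed)
{
	if (htup->t_infomask & HEAP_MOVED_OFF)
		return !xvac_committed; /* source copy dies once the move commits */
	if (htup->t_infomask & HEAP_MOVED_IN)
		return xvac_committed;	/* destination copy lives only on commit */
	return true;				/* tuple untouched by VACUUM FULL */
}
#endif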
2494
2495 /*
2496  *      vacuum_heap() -- free dead tuples
2497  *
2498  *              This routine marks dead tuples as unused and truncates the
2499  *              relation if there are "empty" end-blocks.
2500  */
2501 static void
2502 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
2503 {
2504         Buffer          buf;
2505         VacPage    *vacpage;
2506         BlockNumber relblocks;
2507         int                     nblocks;
2508         int                     i;
2509
2510         nblocks = vacuum_pages->num_pages;
2511         nblocks -= vacuum_pages->empty_end_pages;       /* handled by truncation below */
2512
2513         for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
2514         {
2515                 CHECK_FOR_INTERRUPTS();
2516                 if ((*vacpage)->offsets_free > 0)
2517                 {
2518                         buf = ReadBuffer(onerel, (*vacpage)->blkno);
2519                         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2520                         vacuum_page(onerel, buf, *vacpage);
2521                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2522                         WriteBuffer(buf);
2523                 }
2524         }
2525
2526         /*
2527          * Flush dirty pages out to disk.  We do this unconditionally, even if
2528          * we don't need to truncate, because we want to ensure that all
2529          * tuples have correct on-row commit status on disk (see bufmgr.c's
2530          * comments for FlushRelationBuffers()).
2531          */
2532         Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages);
2533         relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages;
2534
2535         i = FlushRelationBuffers(onerel, relblocks);
2536         if (i < 0)
2537                 elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
2538                          i);
2539
2540         /* truncate relation if there are some empty end-pages */
2541         if (vacuum_pages->empty_end_pages > 0)
2542         {
2543                 elog(elevel, "Rel %s: Pages: %u --> %u.",
2544                          RelationGetRelationName(onerel),
2545                          vacrelstats->rel_pages, relblocks);
2546                 relblocks = smgrtruncate(DEFAULT_SMGR, onerel, relblocks);
2547                 onerel->rd_nblocks = relblocks; /* update relcache immediately */
2548                 onerel->rd_targblock = InvalidBlockNumber;
2549                 vacrelstats->rel_pages = relblocks;             /* set new number of
2550                                                                                                  * blocks */
2551         }
2552 }
2553
2554 /*
2555  *      vacuum_page() -- free dead tuples on a page
2556  *                                       and repair its fragmentation.
2557  */
2558 static void
2559 vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
2560 {
2561         OffsetNumber unused[BLCKSZ / sizeof(OffsetNumber)];
2562         int                     uncnt;
2563         Page            page = BufferGetPage(buffer);
2564         ItemId          itemid;
2565         int                     i;
2566
2567         /* There shouldn't be any tuples moved onto the page yet! */
2568         Assert(vacpage->offsets_used == 0);
2569
2570         START_CRIT_SECTION();
2571
2572         for (i = 0; i < vacpage->offsets_free; i++)
2573         {
2574                 itemid = PageGetItemId(page, vacpage->offsets[i]);
2575                 itemid->lp_flags &= ~LP_USED;
2576         }
2577
2578         uncnt = PageRepairFragmentation(page, unused);
2579
2580         /* XLOG stuff */
2581         if (!onerel->rd_istemp)
2582         {
2583                 XLogRecPtr      recptr;
2584
2585                 recptr = log_heap_clean(onerel, buffer, unused, uncnt);
2586                 PageSetLSN(page, recptr);
2587                 PageSetSUI(page, ThisStartUpID);
2588         }
2589         else
2590         {
2591                 /* No XLOG record, but still need to flag that XID exists on disk */
2592                 MyXactMadeTempRelUpdate = true;
2593         }
2594
2595         END_CRIT_SECTION();
2596 }
2597
2598 /*
2599  *      scan_index() -- scan one index relation to update statistics.
2600  *
2601  * We use this when we have no deletions to do.
2602  */
2603 static void
2604 scan_index(Relation indrel, double num_tuples)
2605 {
2606         IndexBulkDeleteResult *stats;
2607         IndexVacuumCleanupInfo vcinfo;
2608         VacRUsage       ru0;
2609
2610         vac_init_rusage(&ru0);
2611
2612         /*
2613          * Even though we're not planning to delete anything, we use the
2614          * ambulkdelete call, because (a) the scan happens within the index AM
2615          * for more speed, and (b) it may want to pass private statistics to
2616          * the amvacuumcleanup call.
2617          */
2618         stats = index_bulk_delete(indrel, dummy_tid_reaped, NULL);
2619
2620         /* Do post-VACUUM cleanup, even though we deleted nothing */
2621         vcinfo.vacuum_full = true;
2622         vcinfo.message_level = elevel;
2623
2624         stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
2625
2626         if (!stats)
2627                 return;
2628
2629         /* now update statistics in pg_class */
2630         vac_update_relstats(RelationGetRelid(indrel),
2631                                                 stats->num_pages, stats->num_index_tuples,
2632                                                 false);
2633
2634         elog(elevel, "Index %s: Pages %u, %u deleted, %u free; Tuples %.0f.\n\t%s",
2635                  RelationGetRelationName(indrel),
2636                  stats->num_pages, stats->pages_deleted, stats->pages_free,
2637                  stats->num_index_tuples,
2638                  vac_show_rusage(&ru0));
2639
2640         /*
2641          * Check for tuple count mismatch.  If the index is partial, then it's
2642          * OK for it to have fewer tuples than the heap; otherwise we have trouble.
2643          */
2644         if (stats->num_index_tuples != num_tuples)
2645         {
2646                 if (stats->num_index_tuples > num_tuples ||
2647                         !vac_is_partial_index(indrel))
2648                         elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP'S (%.0f)."
2649                                  "\n\tRecreate the index.",
2650                                  RelationGetRelationName(indrel),
2651                                  stats->num_index_tuples, num_tuples);
2652         }
2653
2654         pfree(stats);
2655 }
2656
2657 /*
2658  *      vacuum_index() -- vacuum one index relation.
2659  *
2660  *              vacpagelist is the VacPageList of the heap we're currently vacuuming.
2661  *              It's locked. Indrel is an index relation on the vacuumed heap.
2662  *
2663  *              We don't bother to set locks on the index relation here, since
2664  *              the parent table is exclusive-locked already.
2665  *
2666  *              Finally, we arrange to update the index relation's statistics in
2667  *              pg_class.
2668  */
2669 static void
2670 vacuum_index(VacPageList vacpagelist, Relation indrel,
2671                          double num_tuples, int keep_tuples)
2672 {
2673         IndexBulkDeleteResult *stats;
2674         IndexVacuumCleanupInfo vcinfo;
2675         VacRUsage       ru0;
2676
2677         vac_init_rusage(&ru0);
2678
2679         /* Do bulk deletion */
2680         stats = index_bulk_delete(indrel, tid_reaped, (void *) vacpagelist);
2681
2682         /* Do post-VACUUM cleanup */
2683         vcinfo.vacuum_full = true;
2684         vcinfo.message_level = elevel;
2685
2686         stats = index_vacuum_cleanup(indrel, &vcinfo, stats);
2687
2688         if (!stats)
2689                 return;
2690
2691         /* now update statistics in pg_class */
2692         vac_update_relstats(RelationGetRelid(indrel),
2693                                                 stats->num_pages, stats->num_index_tuples,
2694                                                 false);
2695
2696         elog(elevel, "Index %s: Pages %u, %u deleted, %u free; Tuples %.0f: Deleted %.0f.\n\t%s",
2697                  RelationGetRelationName(indrel),
2698                  stats->num_pages, stats->pages_deleted, stats->pages_free,
2699                  stats->num_index_tuples - keep_tuples, stats->tuples_removed,
2700                  vac_show_rusage(&ru0));
2701
2702         /*
2703          * Check for tuple count mismatch.  If the index is partial, then it's
2704          * OK for it to have fewer tuples than the heap; otherwise we have trouble.
2705          */
2706         if (stats->num_index_tuples != num_tuples + keep_tuples)
2707         {
2708                 if (stats->num_index_tuples > num_tuples + keep_tuples ||
2709                         !vac_is_partial_index(indrel))
2710                         elog(WARNING, "Index %s: NUMBER OF INDEX TUPLES (%.0f) IS NOT THE SAME AS HEAP'S (%.0f)."
2711                                  "\n\tRecreate the index.",
2712                                  RelationGetRelationName(indrel),
2713                                  stats->num_index_tuples, num_tuples);
2714         }
2715
2716         pfree(stats);
2717 }
2718
2719 /*
2720  *      tid_reaped() -- is a particular tid reaped?
2721  *
2722  *              This has the right signature to be an IndexBulkDeleteCallback.
2723  *
2724  *              vacpagelist->pagedesc is sorted in the right order.
2725  */
2726 static bool
2727 tid_reaped(ItemPointer itemptr, void *state)
2728 {
2729         VacPageList vacpagelist = (VacPageList) state;
2730         OffsetNumber ioffno;
2731         OffsetNumber *voff;
2732         VacPage         vp,
2733                            *vpp;
2734         VacPageData vacpage;
2735
2736         vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
2737         ioffno = ItemPointerGetOffsetNumber(itemptr);
2738
2739         vp = &vacpage;
2740         vpp = (VacPage *) vac_bsearch((void *) &vp,
2741                                                                   (void *) (vacpagelist->pagedesc),
2742                                                                   vacpagelist->num_pages,
2743                                                                   sizeof(VacPage),
2744                                                                   vac_cmp_blk);
2745
2746         if (vpp == NULL)
2747                 return false;
2748
2749         /* ok - we are on a partially or fully reaped page */
2750         vp = *vpp;
2751
2752         if (vp->offsets_free == 0)
2753         {
2754                 /* this page is entirely empty, so claim all tuples on it are reaped!!! */
2755                 return true;
2756         }
2757
2758         voff = (OffsetNumber *) vac_bsearch((void *) &ioffno,
2759                                                                                 (void *) (vp->offsets),
2760                                                                                 vp->offsets_free,
2761                                                                                 sizeof(OffsetNumber),
2762                                                                                 vac_cmp_offno);
2763
2764         if (voff == NULL)
2765                 return false;
2766
2767         /* tid is reaped */
2768         return true;
2769 }
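
/*
 * A minimal sketch (a hypothetical helper, not any AM's actual code) of
 * how an index AM's bulk-delete scan consults a callback of this
 * signature; "itup" stands for the index entry currently being examined:
 */
#ifdef NOT_USED
static bool
example_should_delete(IndexTuple itup,
					  IndexBulkDeleteCallback callback, void *callback_state)
{
	/* ask the heap side whether the pointed-to tuple has been reaped */
	return callback(&itup->t_tid, callback_state);
}
#endif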
2770
2771 /*
2772  * Dummy version for scan_index.
2773  */
2774 static bool
2775 dummy_tid_reaped(ItemPointer itemptr, void *state)
2776 {
2777         return false;
2778 }
2779
2780 /*
2781  * Update the shared Free Space Map with the info we now have about
2782  * free space in the relation, discarding any old info the map may have.
2783  */
2784 static void
2785 vac_update_fsm(Relation onerel, VacPageList fraged_pages,
2786                            BlockNumber rel_pages)
2787 {
2788         int                     nPages = fraged_pages->num_pages;
2789         VacPage    *pagedesc = fraged_pages->pagedesc;
2790         Size            threshold;
2791         PageFreeSpaceInfo *pageSpaces;
2792         int                     outPages;
2793         int                     i;
2794
2795         /*
2796          * We only report pages with free space at least equal to the average
2797          * request size --- this avoids cluttering FSM with uselessly-small bits
2798          * of space.  Although FSM would discard pages with little free space
2799          * anyway, it's important to do this prefiltering because (a) it reduces
2800          * the time spent holding the FSM lock in RecordRelationFreeSpace, and
2801          * (b) FSM uses the number of pages reported as a statistic for guiding
2802          * space management.  If we didn't threshold our reports the same way
2803          * vacuumlazy.c does, we'd be skewing that statistic.
2804          */
2805         threshold = GetAvgFSMRequestSize(&onerel->rd_node);
2806
2807         /* +1 to avoid palloc(0) */
2808         pageSpaces = (PageFreeSpaceInfo *)
2809                 palloc((nPages + 1) * sizeof(PageFreeSpaceInfo));
2810         outPages = 0;
2811
2812         for (i = 0; i < nPages; i++)
2813         {
2814                 /*
2815                  * fraged_pages may contain entries for pages that we later
2816                  * decided to truncate from the relation; don't enter them into
2817                  * the free space map!
2818                  */
2819                 if (pagedesc[i]->blkno >= rel_pages)
2820                         break;
2821
2822                 if (pagedesc[i]->free >= threshold)
2823                 {
2824                         pageSpaces[outPages].blkno = pagedesc[i]->blkno;
2825                         pageSpaces[outPages].avail = pagedesc[i]->free;
2826                         outPages++;
2827                 }
2828         }
2829
2830         RecordRelationFreeSpace(&onerel->rd_node, outPages, pageSpaces);
2831
2832         pfree(pageSpaces);
2833 }
2834
2835 /* Copy a VacPage structure */
2836 static VacPage
2837 copy_vac_page(VacPage vacpage)
2838 {
2839         VacPage         newvacpage;
2840
2841         /* allocate a VacPageData entry */
2842         newvacpage = (VacPage) palloc(sizeof(VacPageData) +
2843                                                    vacpage->offsets_free * sizeof(OffsetNumber));
2844
2845         /* fill it in */
2846         if (vacpage->offsets_free > 0)
2847                 memcpy(newvacpage->offsets, vacpage->offsets,
2848                            vacpage->offsets_free * sizeof(OffsetNumber));
2849         newvacpage->blkno = vacpage->blkno;
2850         newvacpage->free = vacpage->free;
2851         newvacpage->offsets_used = vacpage->offsets_used;
2852         newvacpage->offsets_free = vacpage->offsets_free;
2853
2854         return newvacpage;
2855 }
2856
2857 /*
2858  * Add a VacPage pointer to a VacPageList.
2859  *
2860  *              As a side effect of the way that scan_heap works,
2861  *              higher pages come after lower pages in the array
2862  *              (and the highest tid on a page is last).
2863  */
2864 static void
2865 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
2866 {
2867 #define PG_NPAGEDESC 1024
2868
2869         /* allocate a VacPage entry if needed */
2870         if (vacpagelist->num_pages == 0)
2871         {
2872                 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
2873                 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
2874         }
2875         else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
2876         {
2877                 vacpagelist->num_allocated_pages *= 2;
2878                 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
2879         }
2880         vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
2881         (vacpagelist->num_pages)++;
2882 }
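
/*
 * Usage sketch: the list stores pointers, so a caller that keeps reusing
 * a scratch VacPage must insert a copy, as repair_frag does above (names
 * as used there):
 *
 *              vpage_insert(&Nvacpagelist, copy_vac_page(vacpage));
 */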
2883
2884 /*
2885  * vac_bsearch: just like standard C library routine bsearch(),
2886  * except that we first test to see whether the target key is outside
2887  * the range of the table entries.  This case is handled relatively slowly
2888  * by the normal binary search algorithm (ie, no faster than any other key)
2889  * but it occurs often enough in VACUUM to be worth optimizing.
2890  */
2891 static void *
2892 vac_bsearch(const void *key, const void *base,
2893                         size_t nelem, size_t size,
2894                         int (*compar) (const void *, const void *))
2895 {
2896         int                     res;
2897         const void *last;
2898
2899         if (nelem == 0)
2900                 return NULL;
2901         res = compar(key, base);
2902         if (res < 0)
2903                 return NULL;
2904         if (res == 0)
2905                 return (void *) base;
2906         if (nelem > 1)
2907         {
2908                 last = (const void *) ((const char *) base + (nelem - 1) * size);
2909                 res = compar(key, last);
2910                 if (res > 0)
2911                         return NULL;
2912                 if (res == 0)
2913                         return (void *) last;
2914         }
2915         if (nelem <= 2)
2916                 return NULL;                    /* already checked 'em all */
2917         return bsearch(key, base, nelem, size, compar);
2918 }
2919
2920 /*
2921  * Comparator routines for use with qsort() and bsearch().
2922  */
2923 static int
2924 vac_cmp_blk(const void *left, const void *right)
2925 {
2926         BlockNumber lblk,
2927                                 rblk;
2928
2929         lblk = (*((VacPage *) left))->blkno;
2930         rblk = (*((VacPage *) right))->blkno;
2931
2932         if (lblk < rblk)
2933                 return -1;
2934         if (lblk == rblk)
2935                 return 0;
2936         return 1;
2937 }
2938
2939 static int
2940 vac_cmp_offno(const void *left, const void *right)
2941 {
2942         if (*(OffsetNumber *) left < *(OffsetNumber *) right)
2943                 return -1;
2944         if (*(OffsetNumber *) left == *(OffsetNumber *) right)
2945                 return 0;
2946         return 1;
2947 }
2948
2949 static int
2950 vac_cmp_vtlinks(const void *left, const void *right)
2951 {
2952         if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
2953                 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2954                 return -1;
2955         if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
2956                 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2957                 return 1;
2958         /* bi_hi-es are equal */
2959         if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
2960                 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2961                 return -1;
2962         if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
2963                 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2964                 return 1;
2965         /* bi_lo-es are equal */
2966         if (((VTupleLink) left)->new_tid.ip_posid <
2967                 ((VTupleLink) right)->new_tid.ip_posid)
2968                 return -1;
2969         if (((VTupleLink) left)->new_tid.ip_posid >
2970                 ((VTupleLink) right)->new_tid.ip_posid)
2971                 return 1;
2972         return 0;
2973 }
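
/*
 * Sketch of the intended use of vac_cmp_vtlinks (mirroring the sort that
 * scan_heap performs so that repair_frag can binary-search tuple chains;
 * "vacrelstats" as in the rest of this file):
 */
#ifdef NOT_USED
	qsort((char *) vacrelstats->vtlinks, vacrelstats->num_vtlinks,
		  sizeof(VTupleLinkData), vac_cmp_vtlinks);
#endif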
2974
2975
2976 void
2977 vac_open_indexes(Relation relation, int *nindexes, Relation **Irel)
2978 {
2979         List       *indexoidlist,
2980                            *indexoidscan;
2981         int                     i;
2982
2983         indexoidlist = RelationGetIndexList(relation);
2984
2985         *nindexes = length(indexoidlist);
2986
2987         if (*nindexes > 0)
2988                 *Irel = (Relation *) palloc(*nindexes * sizeof(Relation));
2989         else
2990                 *Irel = NULL;
2991
2992         i = 0;
2993         foreach(indexoidscan, indexoidlist)
2994         {
2995                 Oid                     indexoid = lfirsto(indexoidscan);
2996
2997                 (*Irel)[i] = index_open(indexoid);
2998                 i++;
2999         }
3000
3001         freeList(indexoidlist);
3002 }
3003
3004
3005 void
3006 vac_close_indexes(int nindexes, Relation *Irel)
3007 {
3008         if (Irel == (Relation *) NULL)
3009                 return;
3010
3011         while (nindexes--)
3012                 index_close(Irel[nindexes]);
3013         pfree(Irel);
3014 }
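
/*
 * Typical pairing of the two routines above (a sketch; "onerel" is the
 * heap being vacuumed, as elsewhere in this file):
 */
#ifdef NOT_USED
	{
		int			nindexes;
		Relation   *Irel;

		vac_open_indexes(onerel, &nindexes, &Irel);
		/* ... scan_index() or vacuum_index() on each Irel[i] ... */
		vac_close_indexes(nindexes, Irel);
	}
#endif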
3015
3016
3017 /*
3018  * Is an index partial (ie, could it contain fewer tuples than the heap)?
3019  */
3020 bool
3021 vac_is_partial_index(Relation indrel)
3022 {
3023         /*
3024          * If the index's AM doesn't support nulls, it's partial for our
3025          * purposes
3026          */
3027         if (!indrel->rd_am->amindexnulls)
3028                 return true;
3029
3030         /* Otherwise, look to see if there's a partial-index predicate */
3031         if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred))
3032                 return true;
3033
3034         return false;
3035 }
3036
3037
3038 static bool
3039 enough_space(VacPage vacpage, Size len)
3040 {
3041         len = MAXALIGN(len);
3042
3043         if (len > vacpage->free)
3044                 return false;
3045
3046         /* if there are free itemid(s) and len <= free_space... */
3047         if (vacpage->offsets_used < vacpage->offsets_free)
3048                 return true;
3049
3050         /* noff_used >= noff_free, so we'll have to allocate a new itemid */
3051         if (len + sizeof(ItemIdData) <= vacpage->free)
3052                 return true;
3053
3054         return false;
3055 }
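
/*
 * Worked example for enough_space (assuming MAXALIGN rounds to 8 on this
 * platform): a 60-byte tuple needs MAXALIGN(60) = 64 bytes if the page
 * still has a spare line pointer, else 64 + sizeof(ItemIdData) = 68 bytes.
 * In code ("tuple_len" and "vacpage" as used elsewhere in this file):
 */
#ifdef NOT_USED
	Size		needed = MAXALIGN(tuple_len);

	if (vacpage->offsets_used >= vacpage->offsets_free)
		needed += sizeof(ItemIdData);	/* must carve out a new line pointer */
#endif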
3056
3057
3058 /*
3059  * Initialize usage snapshot.
3060  */
3061 void
3062 vac_init_rusage(VacRUsage *ru0)
3063 {
3064         struct timezone tz;
3065
3066         getrusage(RUSAGE_SELF, &ru0->ru);
3067         gettimeofday(&ru0->tv, &tz);
3068 }
3069
3070 /*
3071  * Compute elapsed time since ru0 usage snapshot, and format into
3072  * a displayable string.  Result is in a static string, which is
3073  * tacky, but no one ever claimed that the Postgres backend is
3074  * threadable...
3075  */
3076 const char *
3077 vac_show_rusage(VacRUsage *ru0)
3078 {
3079         static char result[100];
3080         VacRUsage       ru1;
3081
3082         vac_init_rusage(&ru1);
3083
3084         if (ru1.tv.tv_usec < ru0->tv.tv_usec)
3085         {
3086                 ru1.tv.tv_sec--;
3087                 ru1.tv.tv_usec += 1000000;
3088         }
3089         if (ru1.ru.ru_stime.tv_usec < ru0->ru.ru_stime.tv_usec)
3090         {
3091                 ru1.ru.ru_stime.tv_sec--;
3092                 ru1.ru.ru_stime.tv_usec += 1000000;
3093         }
3094         if (ru1.ru.ru_utime.tv_usec < ru0->ru.ru_utime.tv_usec)
3095         {
3096                 ru1.ru.ru_utime.tv_sec--;
3097                 ru1.ru.ru_utime.tv_usec += 1000000;
3098         }
3099
3100         snprintf(result, sizeof(result),
3101                          "CPU %d.%02ds/%d.%02du sec elapsed %d.%02d sec.",
3102                          (int) (ru1.ru.ru_stime.tv_sec - ru0->ru.ru_stime.tv_sec),
3103           (int) (ru1.ru.ru_stime.tv_usec - ru0->ru.ru_stime.tv_usec) / 10000,
3104                          (int) (ru1.ru.ru_utime.tv_sec - ru0->ru.ru_utime.tv_sec),
3105           (int) (ru1.ru.ru_utime.tv_usec - ru0->ru.ru_utime.tv_usec) / 10000,
3106                          (int) (ru1.tv.tv_sec - ru0->tv.tv_sec),
3107                          (int) (ru1.tv.tv_usec - ru0->tv.tv_usec) / 10000);
3108
3109         return result;
3110 }
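
/*
 * Usage sketch for the two rusage helpers (the same pattern scan_index
 * and vacuum_index follow above): snapshot first, then format the delta.
 */
#ifdef NOT_USED
	{
		VacRUsage	ru0;

		vac_init_rusage(&ru0);
		/* ... do the expensive work ... */
		elog(elevel, "done.\n\t%s", vac_show_rusage(&ru0));
	}
#endif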