1 /*-------------------------------------------------------------------------
4 * the postgres vacuum cleaner
6 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.163 2000/07/14 22:17:42 tgl Exp $
14 *-------------------------------------------------------------------------
16 #include <sys/types.h>
24 #include "access/genam.h"
25 #include "access/heapam.h"
26 #include "catalog/catalog.h"
27 #include "catalog/catname.h"
28 #include "catalog/index.h"
29 #include "commands/vacuum.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "storage/sinval.h"
33 #include "storage/smgr.h"
34 #include "tcop/tcopprot.h"
35 #include "utils/acl.h"
36 #include "utils/builtins.h"
37 #include "utils/fmgroids.h"
38 #include "utils/inval.h"
39 #include "utils/relcache.h"
40 #include "utils/syscache.h"
41 #include "utils/temprel.h"
43 #ifndef HAVE_GETRUSAGE
44 #include "rusagestub.h"
47 #include <sys/resource.h>
/*
 * File-scope state and forward declarations for the vacuum cleaner.
 *
 * vac_context   - memory context created in vacuum() as a child of
 *                 QueryContext; used for storage that has to survive the
 *                 per-relation transaction commits (the copied column list
 *                 and the relation list built by getrels()).  It is removed
 *                 again by the MemoryContextDelete(vac_context) call in the
 *                 shutdown code.
 * MESSAGE_LEVEL - elog level used for progress output.  vacuum() assigns it
 *                 NOTICE or DEBUG (presumably depending on the verbose flag;
 *                 the selecting test is not visible in this listing).
 * XmaxRecent    - filled in by GetXmaxRecent() in vacuum_rel() and afterwards
 *                 compared against tuple xmin/xmax values in scan_heap() and
 *                 repair_frag().
 *
 * All functions declared below are static, i.e. private to this file.
 * NOTE(review): the get_index_desc prototype is cut short in this listing
 * (its continuation line is missing).
 */
51 static MemoryContext vac_context = NULL;
53 static int MESSAGE_LEVEL; /* message level */
55 static TransactionId XmaxRecent;
57 /* non-export function prototypes */
58 static void vacuum_init(void);
59 static void vacuum_shutdown(void);
60 static void vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2);
61 static VRelList getrels(NameData *VacRelP);
62 static void vacuum_rel(Oid relid, bool analyze, bool is_toastrel);
63 static void scan_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages);
64 static void repair_frag(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages, int nindices, Relation *Irel);
65 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacpagelist);
66 static void vacuum_page(Page page, VacPage vacpage);
67 static void vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples);
68 static void scan_index(Relation indrel, int num_tuples);
69 static void update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *vacrelstats);
70 static VacPage tid_reaped(ItemPointer itemptr, VacPageList vacpagelist);
71 static void reap_page(VacPageList vacpagelist, VacPage vacpage);
72 static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
73 static void get_indices(Relation relation, int *nindices, Relation **Irel);
74 static void close_indices(int nindices, Relation *Irel);
75 static IndexInfo **get_index_desc(Relation onerel, int nindices,
77 static void *vac_find_eq(void *bot, int nelem, int size, void *elm,
78 int (*compar) (const void *, const void *));
79 static int vac_cmp_blk(const void *left, const void *right);
80 static int vac_cmp_offno(const void *left, const void *right);
81 static int vac_cmp_vtlinks(const void *left, const void *right);
82 static bool enough_space(VacPage vacpage, Size len);
83 static char *show_rusage(struct rusage * ru0);
/*
 * vacuum() -- top-level entry point of the vacuum cleaner.
 *
 * Parameters:
 *   vacrel    - name of the relation to vacuum; it is copied into a local
 *               NameData with namestrcpy() because the caller-owned string
 *               is released at transaction commit.  (The test that decides
 *               whether a name was given at all is not visible in this
 *               listing.)
 *   verbose   - NOTE(review): presumably selects between the NOTICE and
 *               DEBUG assignments to MESSAGE_LEVEL below; the test itself
 *               is not visible in this listing.
 *   analyze   - passed through to vac_vacuum(); a column list without
 *               analyze is rejected with an ERROR.
 *   anal_cols - list of column names (char *), duplicated with pstrdup()
 *               into vac_context so that it survives transaction commits.
 *
 * Steps visible here: reject a column list without analyze, refuse to run
 * inside a transaction block, set the message level, create vac_context as
 * a child of QueryContext, copy pass-by-reference arguments into safe
 * storage, and finally hand over to vac_vacuum().
 */
87 vacuum(char *vacrel, bool verbose, bool analyze, List *anal_cols)
93 List *anal_cols2 = NIL;
95 if (anal_cols != NIL && !analyze)
96 elog(ERROR, "Can't vacuum columns, only tables. You can 'vacuum analyze' columns.");
99 * We cannot run VACUUM inside a user transaction block; if we were
100 * inside a transaction, then our commit- and
101 * start-transaction-command calls would not have the intended effect!
102 * Furthermore, the forced commit that occurs before truncating the
103 * relation's file would have the effect of committing the rest of the
104 * user's transaction too, which would certainly not be the desired
107 if (IsTransactionBlock())
108 elog(ERROR, "VACUUM cannot run inside a BEGIN/END block");
111 MESSAGE_LEVEL = NOTICE;
113 MESSAGE_LEVEL = DEBUG;
116 * Create special memory context for cross-transaction storage.
118 * Since it is a child of QueryContext, it will go away eventually
119 * even if we suffer an error; there's no need for special abort
122 vac_context = AllocSetContextCreate(QueryContext,
124 ALLOCSET_DEFAULT_MINSIZE,
125 ALLOCSET_DEFAULT_INITSIZE,
126 ALLOCSET_DEFAULT_MAXSIZE);
128 /* vacrel gets de-allocated on xact commit, so copy it to safe storage */
131 namestrcpy(&VacRel, vacrel);
132 VacRelName = &VacRel;
137 /* must also copy the column list, if any, to safe storage */
138 old = MemoryContextSwitchTo(vac_context);
139 foreach(le, anal_cols)
141 char *col = (char *) lfirst(le);
143 anal_cols2 = lappend(anal_cols2, pstrdup(col));
145 MemoryContextSwitchTo(old);
148 * Start up the vacuum cleaner.
150 * NOTE: since this commits the current transaction, the memory holding
151 * any passed-in parameters gets freed here. We must have already
152 * copied pass-by-reference parameters to safe storage. Don't make me
157 /* vacuum the database */
158 vac_vacuum(VacRelName, analyze, anal_cols2);
165 * vacuum_init(), vacuum_shutdown() -- start up and shut down the vacuum cleaner.
167 * Formerly, there was code here to prevent more than one VACUUM from
168 * executing concurrently in the same database. However, there's no
169 * good reason to prevent that, and manually removing lockfiles after
170 * a vacuum crash was a pain for dbadmins. So, forget about lockfiles,
171 * and just rely on the exclusive lock we grab on each target table
172 * to ensure that there aren't two VACUUMs running on the same table
175 * The strangeness with committing and starting transactions in the
176 * init and shutdown routines is due to the fact that the vacuum cleaner
177 * is invoked via an SQL command, and so is already executing inside
178 * a transaction. We need to leave ourselves in a predictable state
179 * on entry and exit to the vacuum cleaner. We commit the transaction
180 * started in PostgresMain() inside vacuum_init(), and start one in
181 * vacuum_shutdown() to match the commit waiting for us back in
/*
 * NOTE(review): the definition line is not visible in this listing; going
 * by the header comment above, this is the body of vacuum_init().  It
 * commits the transaction that was already open when VACUUM was invoked,
 * so that the per-relation work can run in transactions of its own.
 */
187 /* matches the StartTransaction in PostgresMain() */
188 CommitTransactionCommand();
/*
 * NOTE(review): the definition line is not visible in this listing; going
 * by the header comment further up, this is the body of vacuum_shutdown().
 * It removes the relcache init file (the unlink result is not checked),
 * starts a new transaction to pair with the commit the caller will issue,
 * and only then deletes vac_context.
 */
194 /* on entry, we are not in a transaction */
197 * Flush the init file that relcache.c uses to save startup time. The
198 * next backend startup will rebuild the init file with up-to-date
199 * information from pg_class. This lets the optimizer see the stats
200 * that we've collected for certain critical system indexes. See
201 * relcache.c for more details.
203 * Ignore any failure to unlink the file, since it might not be there if
204 * no backend has been started since the last vacuum...
206 unlink(RELCACHE_INIT_FILENAME);
208 /* matches the CommitTransaction in PostgresMain() */
209 StartTransactionCommand();
212 * Clean up working storage --- note we must do this after
213 * StartTransactionCommand, else we might be trying to delete
214 * the active context!
216 MemoryContextDelete(vac_context);
221 * vac_vacuum() -- vacuum the database.
223 * This routine builds a list of relations to vacuum, and then calls
224 * code that vacuums them one at a time. We are careful to vacuum each
225 * relation in a separate transaction in order to avoid holding too many
/*
 * Parameters:
 *   VacRelP    - name of the single relation to process, handed unchanged
 *                to getrels() which builds the work list.
 *   analyze    - forwarded to vacuum_rel() for every list entry.
 *   anal_cols2 - column list forwarded to analyze_rel().
 *
 * Each list entry is vacuumed with is_toastrel = false.
 * NOTE(review): the analyze_rel() call is presumably guarded by a test on
 * analyze; that line is not visible in this listing.
 */
229 vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2)
234 /* get list of relations */
235 vrl = getrels(VacRelP);
237 /* vacuum each heap relation */
238 for (cur = vrl; cur != (VRelList) NULL; cur = cur->vrl_next)
240 vacuum_rel(cur->vrl_relid, analyze, false);
241 /* analyze separately so locking is minimized */
243 analyze_rel(cur->vrl_relid, anal_cols2, MESSAGE_LEVEL);
/*
 * getrels() -- build the list of relations to be vacuumed.
 *
 * Runs inside its own transaction (started and committed here).  A scan
 * key is set up either on pg_class.relname (after mapping a temp table
 * name to its real name via get_temp_rel_by_username) or on
 * pg_class.relkind = r, and pg_class is then scanned under
 * AccessShareLock.  Tuples whose relkind is not RELKIND_RELATION produce
 * a NOTICE; for the others a VRelListData node carrying the tuple OID is
 * allocated in vac_context, so the list outlives the commit at the end.
 *
 * Parameter:
 *   VacRelP - name of the relation to look up.
 *
 * Returns a VRelList according to the prototype; presumably the list head
 * (the return statement is not visible in this listing).
 *
 * NOTE(review): the test on NameStr(*VacRelP) looks at an address derived
 * from VacRelP rather than at VacRelP itself; it appears to rely on the
 * caller passing NULL when no relation was named -- confirm.
 */
248 getrels(NameData *VacRelP)
263 StartTransactionCommand();
265 if (NameStr(*VacRelP))
269 * we could use the cache here, but it is clearer to use scankeys
270 * for both vacuum cases, bjm 2000/01/19
272 char *nontemp_relname;
274 /* We must re-map temp table names bjm 2000-04-06 */
275 if ((nontemp_relname =
276 get_temp_rel_by_username(NameStr(*VacRelP))) == NULL)
277 nontemp_relname = NameStr(*VacRelP);
279 ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relname,
281 PointerGetDatum(nontemp_relname));
285 ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relkind,
286 F_CHAREQ, CharGetDatum('r'));
289 vrl = cur = (VRelList) NULL;
291 rel = heap_openr(RelationRelationName, AccessShareLock);
292 tupdesc = RelationGetDescr(rel);
294 scan = heap_beginscan(rel, false, SnapshotNow, 1, &key);
296 while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
300 d = heap_getattr(tuple, Anum_pg_class_relname, tupdesc, &n);
303 d = heap_getattr(tuple, Anum_pg_class_relkind, tupdesc, &n);
305 rkind = DatumGetChar(d);
307 if (rkind != RELKIND_RELATION)
309 elog(NOTICE, "Vacuum: can not process index and certain system tables");
313 /* get a relation list entry for this guy */
314 if (vrl == (VRelList) NULL)
315 vrl = cur = (VRelList)
316 MemoryContextAlloc(vac_context, sizeof(VRelListData));
319 cur->vrl_next = (VRelList)
320 MemoryContextAlloc(vac_context, sizeof(VRelListData));
324 cur->vrl_relid = tuple->t_data->t_oid;
325 cur->vrl_next = (VRelList) NULL;
329 heap_close(rel, AccessShareLock);
332 elog(NOTICE, "Vacuum: table not found");
334 CommitTransactionCommand();
340 * vacuum_rel() -- vacuum one heap relation
342 * This routine vacuums a single heap, cleans out its indices, and
343 * updates its num_pages and num_tuples statistics.
345 * Doing one heap at a time incurs extra overhead, since we need to
346 * check that the heap exists again just before we vacuum it. The
347 * reason that we do this is so that vacuuming can be spread across
348 * many small transactions. Otherwise, two-phase locking would require
349 * us to lock the entire database during one pass of the vacuum cleaner.
/*
 * Parameters:
 *   relid       - OID of the heap relation to vacuum.
 *   analyze     - NOTE(review): no use of this flag is visible in this
 *                 listing; the recursive call for the TOAST relation
 *                 passes false.
 *   is_toastrel - NOTE(review): no use of this flag is visible in this
 *                 listing; the recursive call for the TOAST relation
 *                 passes true.
 *
 * Outline of the visible steps:
 *   1. start a transaction and re-check via the RELOID syscache that the
 *      pg_class entry still exists;
 *   2. open the relation with AccessExclusiveLock, remember its TOAST
 *      relation OID, and skip with a NOTICE when the ownership check
 *      fails;
 *   3. allocate VRelStats, fetch XmaxRecent, and run scan_heap() to fill
 *      the vacuum_pages and fraged_pages lists;
 *   4. open the indices (or deactivate them through
 *      activate_indexes_of_a_table), then clean or scan each index;
 *   5. call repair_frag() when there are candidate target pages,
 *      otherwise close the indices and clean the reaped pages with
 *      vacuum_heap();
 *   6. free the page lists, close the relation with NoLock so the lock is
 *      kept until commit, and write the statistics with update_relstats();
 *   7. vacuum the TOAST relation, if any, by a recursive call, then
 *      commit.
 */
352 vacuum_rel(Oid relid, bool analyze, bool is_toastrel)
356 VacPageListData vacuum_pages; /* List of pages to vacuum and/or clean
358 VacPageListData fraged_pages; /* List of pages with space enough for
364 VRelStats *vacrelstats;
365 bool reindex = false;
369 StartTransactionCommand();
372 * Check for user-requested abort. Note we want this to be inside a
373 * transaction, so xact.c doesn't issue useless NOTICE.
379 * Race condition -- if the pg_class tuple has gone away since the
380 * last time we saw it, we don't need to vacuum it.
382 tuple = SearchSysCacheTuple(RELOID,
383 ObjectIdGetDatum(relid),
385 if (!HeapTupleIsValid(tuple))
388 CommitTransactionCommand();
393 * Open the class, get an exclusive lock on it, and check permissions.
395 * Note we choose to treat permissions failure as a NOTICE and keep
396 * trying to vacuum the rest of the DB --- is this appropriate?
398 onerel = heap_open(relid, AccessExclusiveLock);
401 * Remember the relations TOAST relation for later
404 toast_relid = onerel->rd_rel->reltoastrelid;
407 if (!pg_ownercheck(GetPgUserName(), RelationGetRelationName(onerel),
410 elog(NOTICE, "Skipping \"%s\" --- only table owner can VACUUM it",
411 RelationGetRelationName(onerel));
412 heap_close(onerel, AccessExclusiveLock);
414 CommitTransactionCommand();
420 * Set up statistics-gathering machinery.
422 vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
423 vacrelstats->relid = relid;
424 vacrelstats->num_pages = vacrelstats->num_tuples = 0;
425 vacrelstats->hasindex = false;
427 GetXmaxRecent(&XmaxRecent);
431 vacuum_pages.num_pages = fraged_pages.num_pages = 0;
432 scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
433 if (IsIgnoringSystemIndexes() && IsSystemRelationName(RelationGetRelationName(onerel)))
436 /* Now open indices */
438 Irel = (Relation *) NULL;
439 get_indices(onerel, &nindices, &Irel);
442 else if (!RelationGetForm(onerel)->relhasindex)
445 vacrelstats->hasindex = true;
447 vacrelstats->hasindex = false;
450 for (i = 0; i < nindices; i++)
451 index_close(Irel[i]);
452 Irel = (Relation *) NULL;
453 activate_indexes_of_a_table(relid, false);
456 /* Clean/scan index relation(s) */
457 if (Irel != (Relation *) NULL)
459 if (vacuum_pages.num_pages > 0)
461 for (i = 0; i < nindices; i++)
462 vacuum_index(&vacuum_pages, Irel[i], vacrelstats->num_tuples, 0);
465 /* just scan indices to update statistic */
467 for (i = 0; i < nindices; i++)
468 scan_index(Irel[i], vacrelstats->num_tuples);
472 if (fraged_pages.num_pages > 0) /* Try to shrink heap */
473 repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages, nindices, Irel);
476 if (Irel != (Relation *) NULL)
477 close_indices(nindices, Irel);
478 if (vacuum_pages.num_pages > 0) /* Clean pages from
479 * vacuum_pages list */
480 vacuum_heap(vacrelstats, onerel, &vacuum_pages);
483 activate_indexes_of_a_table(relid, true);
485 /* ok - free vacuum_pages list of reaped pages */
486 if (vacuum_pages.num_pages > 0)
488 vacpage = vacuum_pages.pagedesc;
489 for (i = 0; i < vacuum_pages.num_pages; i++, vacpage++)
491 pfree(vacuum_pages.pagedesc);
492 if (fraged_pages.num_pages > 0)
493 pfree(fraged_pages.pagedesc);
496 /* all done with this class, but hold lock until commit */
497 heap_close(onerel, NoLock);
499 /* update statistics in pg_class */
500 update_relstats(vacrelstats->relid, vacrelstats->num_pages,
501 vacrelstats->num_tuples, vacrelstats->hasindex, vacrelstats);
503 /* If the relation has a secondary toast one, vacuum that too
504 * while we still hold the lock on the master table. We don't
505 * need to propagate "analyze" to it, because the toaster
506 * allways uses hardcoded index access and statistics are
507 * totally unimportant for toast relations
509 if (toast_relid != InvalidOid)
510 vacuum_rel(toast_relid, false, true);
512 /* next command frees attribute stats */
514 CommitTransactionCommand();
518 * scan_heap() -- scan an open heap relation
520 * This routine sets commit times, constructs vacuum_pages list of
521 * empty/uninitialized pages and pages with dead tuples and
522 * ~LP_USED line pointers, constructs fraged_pages list of pages
523 * appropriate for purposes of shrinking and maintains statistics
524 * on the number of live tuples in a heap.
/*
 * Parameters:
 *   vacrelstats  - receives num_tuples, num_pages, min_tlen, max_tlen and
 *                  the sorted vtlinks array (or NULL / 0 when there is
 *                  nothing usable).
 *   onerel       - the open heap relation; every block from 0 up to
 *                  RelationGetNumberOfBlocks() is read.
 *   vacuum_pages - output list; pages are added with reap_page() when they
 *                  are uninitialized, empty, contain removable tuples, or
 *                  have unused line pointers.  Also receives
 *                  empty_end_pages.
 *   fraged_pages - output list; filled with those vacuum_pages entries for
 *                  which enough_space(vp, min_tlen) holds.  Also receives
 *                  empty_end_pages.
 *
 * Per tuple, the visible code inspects the infomask hint bits and the
 * transaction status of xmin/xmax, sets hint bits where the outcome is
 * known, and switches do_shrinking off (with a NOTICE) when an inserting
 * or deleting transaction is still in progress.  Removable tuples are
 * marked unused on a scratch copy of the page (tempPage) so that
 * PageRepairFragmentation() can compute the free space the page would have
 * after cleaning, without modifying the real page here.
 *
 * The vtlinks array starts with 100 entries and is grown with repalloc();
 * it records (new_tid, this_tid) pairs for updated tuples and is sorted
 * with qsort() before being stored in vacrelstats.
 *
 * NOTE(review): the assignment min_tlen = max_tlen = 0 near the end is
 * presumably conditional (e.g. on an empty relation); the guarding line is
 * not visible in this listing.
 */
527 scan_heap(VRelStats *vacrelstats, Relation onerel,
528 VacPageList vacuum_pages, VacPageList fraged_pages)
546 uint32 tups_vacuumed,
557 Size min_tlen = MaxTupleSize;
560 bool do_shrinking = true;
561 VTupleLink vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
563 int free_vtlinks = 100;
566 getrusage(RUSAGE_SELF, &ru0);
568 relname = RelationGetRelationName(onerel);
569 elog(MESSAGE_LEVEL, "--Relation %s--", relname);
571 tups_vacuumed = num_tuples = nkeep = nunused = ncrash = empty_pages =
572 new_pages = changed_pages = empty_end_pages = 0;
573 free_size = usable_free_size = 0;
575 nblocks = RelationGetNumberOfBlocks(onerel);
577 vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
578 vacpage->offsets_used = 0;
580 for (blkno = 0; blkno < nblocks; blkno++)
582 buf = ReadBuffer(onerel, blkno);
583 page = BufferGetPage(buf);
584 vacpage->blkno = blkno;
585 vacpage->offsets_free = 0;
589 elog(NOTICE, "Rel %s: Uninitialized page %u - fixing",
591 PageInit(page, BufferGetPageSize(buf), 0);
592 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
593 free_size += (vacpage->free - sizeof(ItemIdData));
596 reap_page(vacuum_pages, vacpage);
601 if (PageIsEmpty(page))
603 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
604 free_size += (vacpage->free - sizeof(ItemIdData));
607 reap_page(vacuum_pages, vacpage);
614 maxoff = PageGetMaxOffsetNumber(page);
615 for (offnum = FirstOffsetNumber;
617 offnum = OffsetNumberNext(offnum))
619 itemid = PageGetItemId(page, offnum);
622 * Collect un-used items too - it's possible to have indices
623 * pointing here after crash.
625 if (!ItemIdIsUsed(itemid))
627 vacpage->offsets[vacpage->offsets_free++] = offnum;
632 tuple.t_datamcxt = NULL;
633 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
634 tuple.t_len = ItemIdGetLength(itemid);
635 ItemPointerSet(&(tuple.t_self), blkno, offnum);
638 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
640 if (tuple.t_data->t_infomask & HEAP_XMIN_INVALID)
642 else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
644 if (TransactionIdDidCommit((TransactionId)
645 tuple.t_data->t_cmin))
647 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
652 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
656 else if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
658 if (!TransactionIdDidCommit((TransactionId)
659 tuple.t_data->t_cmin))
661 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
666 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
672 if (TransactionIdDidAbort(tuple.t_data->t_xmin))
674 else if (TransactionIdDidCommit(tuple.t_data->t_xmin))
676 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
679 else if (!TransactionIdIsInProgress(tuple.t_data->t_xmin))
683 * Not Aborted, Not Committed, Not in Progress -
684 * so it's from crashed process. - vadim 11/26/96
691 elog(NOTICE, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
692 relname, blkno, offnum, tuple.t_data->t_xmin);
693 do_shrinking = false;
699 * here we are concerned about tuples with xmin committed and
700 * xmax unknown or committed
702 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED &&
703 !(tuple.t_data->t_infomask & HEAP_XMAX_INVALID))
705 if (tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED)
707 if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
710 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
715 else if (TransactionIdDidAbort(tuple.t_data->t_xmax))
717 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
720 else if (TransactionIdDidCommit(tuple.t_data->t_xmax))
722 if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
724 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
730 else if (!TransactionIdIsInProgress(tuple.t_data->t_xmax))
734 * Not Aborted, Not Committed, Not in Progress - so it
735 * from crashed process. - vadim 06/02/97
737 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
742 elog(NOTICE, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
743 relname, blkno, offnum, tuple.t_data->t_xmax);
744 do_shrinking = false;
748 * If tuple is recently deleted then we must not remove it
751 if (tupgone && (tuple.t_data->t_infomask & HEAP_XMIN_INVALID) == 0 && tuple.t_data->t_xmax >= XmaxRecent)
755 if (!(tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED))
757 tuple.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
762 * If we do shrinking and this tuple is updated one
763 * then remember it to construct updated tuple
766 if (do_shrinking && !(ItemPointerEquals(&(tuple.t_self),
767 &(tuple.t_data->t_ctid))))
769 if (free_vtlinks == 0)
772 vtlinks = (VTupleLink) repalloc(vtlinks,
773 (free_vtlinks + num_vtlinks) *
774 sizeof(VTupleLinkData));
776 vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
777 vtlinks[num_vtlinks].this_tid = tuple.t_self;
787 if (!OidIsValid(tuple.t_data->t_oid))
789 elog(NOTICE, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
790 relname, blkno, offnum, tupgone);
797 if (tempPage == (Page) NULL)
801 pageSize = PageGetPageSize(page);
802 tempPage = (Page) palloc(pageSize);
803 memmove(tempPage, page, pageSize);
806 lpp = &(((PageHeader) tempPage)->pd_linp[offnum - 1]);
809 lpp->lp_flags &= ~LP_USED;
811 vacpage->offsets[vacpage->offsets_free++] = offnum;
819 if (tuple.t_len < min_tlen)
820 min_tlen = tuple.t_len;
821 if (tuple.t_len > max_tlen)
822 max_tlen = tuple.t_len;
835 if (tempPage != (Page) NULL)
836 { /* Some tuples are gone */
837 PageRepairFragmentation(tempPage);
838 vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
839 free_size += vacpage->free;
840 reap_page(vacuum_pages, vacpage);
842 tempPage = (Page) NULL;
844 else if (vacpage->offsets_free > 0)
845 { /* there are only ~LP_USED line pointers */
846 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
847 free_size += vacpage->free;
848 reap_page(vacuum_pages, vacpage);
860 /* save stats in the rel list for use later */
861 vacrelstats->num_tuples = num_tuples;
862 vacrelstats->num_pages = nblocks;
863 /* vacrelstats->natts = attr_cnt;*/
865 min_tlen = max_tlen = 0;
866 vacrelstats->min_tlen = min_tlen;
867 vacrelstats->max_tlen = max_tlen;
869 vacuum_pages->empty_end_pages = empty_end_pages;
870 fraged_pages->empty_end_pages = empty_end_pages;
873 * Try to make fraged_pages keeping in mind that we can't use free
874 * space of "empty" end-pages and last page if it reaped.
876 if (do_shrinking && vacuum_pages->num_pages - empty_end_pages > 0)
878 int nusf; /* blocks usefull for re-using */
880 nusf = vacuum_pages->num_pages - empty_end_pages;
881 if ((vacuum_pages->pagedesc[nusf - 1])->blkno == nblocks - empty_end_pages - 1)
884 for (i = 0; i < nusf; i++)
886 vp = vacuum_pages->pagedesc[i];
887 if (enough_space(vp, min_tlen))
889 vpage_insert(fraged_pages, vp);
890 usable_free_size += vp->free;
895 if (usable_free_size > 0 && num_vtlinks > 0)
897 qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
899 vacrelstats->vtlinks = vtlinks;
900 vacrelstats->num_vtlinks = num_vtlinks;
904 vacrelstats->vtlinks = NULL;
905 vacrelstats->num_vtlinks = 0;
909 elog(MESSAGE_LEVEL, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
910 Tup %u: Vac %u, Keep/VTL %u/%u, Crash %u, UnUsed %u, MinLen %u, MaxLen %u; \
911 Re-using: Free/Avail. Space %u/%u; EndEmpty/Avail. Pages %u/%u. %s",
912 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
913 new_pages, num_tuples, tups_vacuumed,
914 nkeep, vacrelstats->num_vtlinks, ncrash,
915 nunused, min_tlen, max_tlen, free_size, usable_free_size,
916 empty_end_pages, fraged_pages->num_pages,
923 * repair_frag() -- try to repair relation's fragmentation
925 * This routine marks dead tuples as unused and tries to re-use dead space
926 * by moving tuples (and inserting index entries if needed). It constructs
927 * the Nvacpagelist list of freed pages (moved tuples) and cleans indices
928 * for them after committing the current transaction (in a hackish manner -
929 * without losing locks or freeing memory!). It truncates the relation
930 * if some end-blocks have gone away.
933 repair_frag(VRelStats *vacrelstats, Relation onerel,
934 VacPageList vacuum_pages, VacPageList fraged_pages,
935 int nindices, Relation *Irel)
945 OffsetNumber offnum = 0,
954 IndexInfo **indexInfo = NULL;
955 Datum idatum[INDEX_MAX_KEYS];
956 char inulls[INDEX_MAX_KEYS];
957 InsertIndexResult iresult;
958 VacPageListData Nvacpagelist;
959 VacPage cur_page = NULL,
964 int last_move_dest_block = -1,
979 getrusage(RUSAGE_SELF, &ru0);
981 myXID = GetCurrentTransactionId();
982 myCID = GetCurrentCommandId();
984 tupdesc = RelationGetDescr(onerel);
986 if (Irel != (Relation *) NULL) /* preparation for index' inserts */
987 indexInfo = get_index_desc(onerel, nindices, Irel);
989 Nvacpagelist.num_pages = 0;
990 num_fraged_pages = fraged_pages->num_pages;
991 Assert(vacuum_pages->num_pages > vacuum_pages->empty_end_pages);
992 vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
993 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
994 last_vacuum_block = last_vacuum_page->blkno;
995 cur_buffer = InvalidBuffer;
998 vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
999 vacpage->offsets_used = vacpage->offsets_free = 0;
1002 * Scan pages backwards from the last nonempty page, trying to move
1003 * tuples down to lower pages. Quit when we reach a page that we have
1004 * moved any tuples onto. Note that if a page is still in the
1005 * fraged_pages list (list of candidate move-target pages) when we
1006 * reach it, we will remove it from the list. This ensures we never
1007 * move a tuple up to a higher page number.
1009 * NB: this code depends on the vacuum_pages and fraged_pages lists being
1010 * in order, and on fraged_pages being a subset of vacuum_pages.
1012 nblocks = vacrelstats->num_pages;
1013 for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1014 blkno > last_move_dest_block;
1017 buf = ReadBuffer(onerel, blkno);
1018 page = BufferGetPage(buf);
1020 vacpage->offsets_free = 0;
1022 isempty = PageIsEmpty(page);
1025 if (blkno == last_vacuum_block) /* it's reaped page */
1027 if (last_vacuum_page->offsets_free > 0) /* there are dead tuples */
1028 { /* on this page - clean */
1030 vacuum_page(page, last_vacuum_page);
1036 if (vacuumed_pages > 0)
1038 /* get prev reaped page from vacuum_pages */
1039 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1040 last_vacuum_block = last_vacuum_page->blkno;
1044 last_vacuum_page = NULL;
1045 last_vacuum_block = -1;
1047 if (num_fraged_pages > 0 &&
1048 fraged_pages->pagedesc[num_fraged_pages - 1]->blkno ==
1049 (BlockNumber) blkno)
1051 /* page is in fraged_pages too; remove it */
1063 chain_tuple_moved = false; /* no one chain-tuple was moved
1064 * off this page, yet */
1065 vacpage->blkno = blkno;
1066 maxoff = PageGetMaxOffsetNumber(page);
1067 for (offnum = FirstOffsetNumber;
1069 offnum = OffsetNumberNext(offnum))
1071 itemid = PageGetItemId(page, offnum);
1073 if (!ItemIdIsUsed(itemid))
1076 tuple.t_datamcxt = NULL;
1077 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1078 tuple_len = tuple.t_len = ItemIdGetLength(itemid);
1079 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1081 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1083 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1084 elog(ERROR, "Invalid XID in t_cmin");
1085 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1086 elog(ERROR, "HEAP_MOVED_IN was not expected");
1089 * If this (chain) tuple has already been moved by me then I
1090 * have to check whether it is in vacpage or not - i.e. whether it was
1091 * moved while cleaning this page or some previous one.
1093 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1095 if (keep_tuples == 0)
1097 if (chain_tuple_moved) /* some chains was moved
1099 { /* cleaning this page */
1100 Assert(vacpage->offsets_free > 0);
1101 for (i = 0; i < vacpage->offsets_free; i++)
1103 if (vacpage->offsets[i] == offnum)
1106 if (i >= vacpage->offsets_free) /* not found */
1108 vacpage->offsets[vacpage->offsets_free++] = offnum;
1114 vacpage->offsets[vacpage->offsets_free++] = offnum;
1119 elog(ERROR, "HEAP_MOVED_OFF was expected");
1123 * If this tuple is in a chain of tuples created in updates
1124 * by "recent" transactions then we have to move the whole chain of
1125 * tuples to other places.
1127 if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
1128 tuple.t_data->t_xmin >= XmaxRecent) ||
1129 (!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1130 !(ItemPointerEquals(&(tuple.t_self), &(tuple.t_data->t_ctid)))))
1135 ItemPointerData Ctid;
1136 HeapTupleData tp = tuple;
1137 Size tlen = tuple_len;
1138 VTupleMove vtmove = (VTupleMove)
1139 palloc(100 * sizeof(VTupleMoveData));
1141 int free_vtmove = 100;
1142 VacPage to_vacpage = NULL;
1144 bool freeCbuf = false;
1147 if (vacrelstats->vtlinks == NULL)
1148 elog(ERROR, "No one parent tuple was found");
1149 if (cur_buffer != InvalidBuffer)
1151 WriteBuffer(cur_buffer);
1152 cur_buffer = InvalidBuffer;
1156 * If this tuple is in the begin/middle of the chain then
1157 * we have to move to the end of chain.
1159 while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1160 !(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid))))
1162 Ctid = tp.t_data->t_ctid;
1164 ReleaseBuffer(Cbuf);
1166 Cbuf = ReadBuffer(onerel,
1167 ItemPointerGetBlockNumber(&Ctid));
1168 Cpage = BufferGetPage(Cbuf);
1169 Citemid = PageGetItemId(Cpage,
1170 ItemPointerGetOffsetNumber(&Ctid));
1171 if (!ItemIdIsUsed(Citemid))
1175 * This means that in the middle of chain there
1176 * was tuple updated by older (than XmaxRecent)
1177 * xaction and this tuple is already deleted by
1178 * me. Actually, upper part of chain should be
1179 * removed and seems that this should be handled
1180 * in scan_heap(), but it's not implemented at
1181 * the moment and so we just stop shrinking here.
1183 ReleaseBuffer(Cbuf);
1186 elog(NOTICE, "Child itemid in update-chain marked as unused - can't continue repair_frag");
1189 tp.t_datamcxt = NULL;
1190 tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1192 tlen = tp.t_len = ItemIdGetLength(Citemid);
1196 /* first, can chain be moved ? */
1199 if (to_vacpage == NULL ||
1200 !enough_space(to_vacpage, tlen))
1204 * if to_vacpage no longer has enough free space to be
1205 * useful, remove it from fraged_pages list
1207 if (to_vacpage != NULL &&
1208 !enough_space(to_vacpage, vacrelstats->min_tlen))
1210 Assert(num_fraged_pages > to_item);
1211 memmove(fraged_pages->pagedesc + to_item,
1212 fraged_pages->pagedesc + to_item + 1,
1213 sizeof(VacPage) * (num_fraged_pages - to_item - 1));
1216 for (i = 0; i < num_fraged_pages; i++)
1218 if (enough_space(fraged_pages->pagedesc[i], tlen))
1222 /* can't move item anywhere */
1223 if (i == num_fraged_pages)
1225 for (i = 0; i < num_vtmove; i++)
1227 Assert(vtmove[i].vacpage->offsets_used > 0);
1228 (vtmove[i].vacpage->offsets_used)--;
1234 to_vacpage = fraged_pages->pagedesc[to_item];
1236 to_vacpage->free -= MAXALIGN(tlen);
1237 if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
1238 to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
1239 (to_vacpage->offsets_used)++;
1240 if (free_vtmove == 0)
1243 vtmove = (VTupleMove) repalloc(vtmove,
1244 (free_vtmove + num_vtmove) *
1245 sizeof(VTupleMoveData));
1247 vtmove[num_vtmove].tid = tp.t_self;
1248 vtmove[num_vtmove].vacpage = to_vacpage;
1249 if (to_vacpage->offsets_used == 1)
1250 vtmove[num_vtmove].cleanVpd = true;
1252 vtmove[num_vtmove].cleanVpd = false;
1257 if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
1258 tp.t_data->t_xmin < XmaxRecent)
1261 /* Well, try to find tuple with old row version */
1268 VTupleLinkData vtld,
1271 vtld.new_tid = tp.t_self;
1273 vac_find_eq((void *) (vacrelstats->vtlinks),
1274 vacrelstats->num_vtlinks,
1275 sizeof(VTupleLinkData),
1279 elog(ERROR, "Parent tuple was not found");
1280 tp.t_self = vtlp->this_tid;
1281 Pbuf = ReadBuffer(onerel,
1282 ItemPointerGetBlockNumber(&(tp.t_self)));
1283 Ppage = BufferGetPage(Pbuf);
1284 Pitemid = PageGetItemId(Ppage,
1285 ItemPointerGetOffsetNumber(&(tp.t_self)));
1286 if (!ItemIdIsUsed(Pitemid))
1287 elog(ERROR, "Parent itemid marked as unused");
1288 Ptp.t_datamcxt = NULL;
1289 Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
1290 Assert(ItemPointerEquals(&(vtld.new_tid),
1291 &(Ptp.t_data->t_ctid)));
1294 * Read above about cases when
1295 * !ItemIdIsUsed(Citemid) (child item is
1296 * removed)... Due to the fact that at the moment
1297 * we don't remove unuseful part of update-chain,
1298 * it's possible to get too old parent row here.
1299 * As in the case which caused this problem,
1300 * we stop shrinking here. I could try to find
1301 * the real parent row but do not want to, because
1302 * a real solution will be implemented anyway,
1303 * later, and we are too close to 6.5 release. -
1306 if (Ptp.t_data->t_xmax != tp.t_data->t_xmin)
1309 ReleaseBuffer(Cbuf);
1311 ReleaseBuffer(Pbuf);
1312 for (i = 0; i < num_vtmove; i++)
1314 Assert(vtmove[i].vacpage->offsets_used > 0);
1315 (vtmove[i].vacpage->offsets_used)--;
1318 elog(NOTICE, "Too old parent tuple found - can't continue repair_frag");
1321 #ifdef NOT_USED /* I'm not sure that this will wotk
1325 * If this tuple is updated version of row and it
1326 * was created by the same transaction then no one
1327 * is interested in this tuple - mark it as
1330 if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
1331 Ptp.t_data->t_xmin == Ptp.t_data->t_xmax)
1333 TransactionIdStore(myXID,
1334 (TransactionId *) &(Ptp.t_data->t_cmin));
1335 Ptp.t_data->t_infomask &=
1336 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1337 Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
1342 tp.t_datamcxt = Ptp.t_datamcxt;
1343 tp.t_data = Ptp.t_data;
1344 tlen = tp.t_len = ItemIdGetLength(Pitemid);
1346 ReleaseBuffer(Cbuf);
1351 if (num_vtmove == 0)
1355 ReleaseBuffer(Cbuf);
1356 if (num_vtmove == 0) /* chain can't be moved */
1361 ItemPointerSetInvalid(&Ctid);
1362 for (ti = 0; ti < num_vtmove; ti++)
1364 VacPage destvacpage = vtmove[ti].vacpage;
1366 /* Get tuple from chain */
1367 tuple.t_self = vtmove[ti].tid;
1368 Cbuf = ReadBuffer(onerel,
1369 ItemPointerGetBlockNumber(&(tuple.t_self)));
1370 Cpage = BufferGetPage(Cbuf);
1371 Citemid = PageGetItemId(Cpage,
1372 ItemPointerGetOffsetNumber(&(tuple.t_self)));
1373 tuple.t_datamcxt = NULL;
1374 tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1375 tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1376 /* Get page to move in */
1377 cur_buffer = ReadBuffer(onerel, destvacpage->blkno);
1380 * We should LockBuffer(cur_buffer) but don't, at the
1381 * moment. If you'll do LockBuffer then UNLOCK it
1382 * before index_insert: unique btree-s call heap_fetch
1383 * to get t_infomask of inserted heap tuple !!!
1385 ToPage = BufferGetPage(cur_buffer);
1388 * If this page was not used before - clean it.
1390 * This path is different from the other callers of
1391 * vacuum_page, because we have already incremented the
1392 * vacpage's offsets_used field to account for the
1393 * tuple(s) we expect to move onto the page. Therefore
1394 * vacuum_page's check for offsets_used == 0 is
1395 * wrong. But since that's a good debugging check for
1396 * all other callers, we work around it here rather
1399 if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1401 int sv_offsets_used = destvacpage->offsets_used;
1403 destvacpage->offsets_used = 0;
1404 vacuum_page(ToPage, destvacpage);
1405 destvacpage->offsets_used = sv_offsets_used;
1407 heap_copytuple_with_tuple(&tuple, &newtup);
1408 RelationInvalidateHeapTuple(onerel, &tuple);
1409 TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
1410 newtup.t_data->t_infomask &=
1411 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1412 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1413 newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1414 InvalidOffsetNumber, LP_USED);
1415 if (newoff == InvalidOffsetNumber)
1417 elog(ERROR, "moving chain: failed to add item with len = %u to page %u",
1418 tuple_len, destvacpage->blkno);
1420 newitemid = PageGetItemId(ToPage, newoff);
1421 pfree(newtup.t_data);
1422 newtup.t_datamcxt = NULL;
1423 newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1424 ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
1425 if (((int) destvacpage->blkno) > last_move_dest_block)
1426 last_move_dest_block = destvacpage->blkno;
1429 * Set t_ctid pointing to itself for last tuple in
1430 * chain and to next tuple in chain otherwise.
1432 if (!ItemPointerIsValid(&Ctid))
1433 newtup.t_data->t_ctid = newtup.t_self;
1435 newtup.t_data->t_ctid = Ctid;
1436 Ctid = newtup.t_self;
1438 TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
1439 tuple.t_data->t_infomask &=
1440 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1441 tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1446 * Remember that we moved tuple from the current page
1447 * (corresponding index tuple will be cleaned).
1450 vacpage->offsets[vacpage->offsets_free++] =
1451 ItemPointerGetOffsetNumber(&(tuple.t_self));
1455 if (Irel != (Relation *) NULL)
1458 * XXX using CurrentMemoryContext here means
1459 * intra-vacuum memory leak for functional indexes.
1460 * Should fix someday.
1462 * XXX This code fails to handle partial indexes!
1463 * Probably should change it to use ExecOpenIndices.
1465 for (i = 0; i < nindices; i++)
1467 FormIndexDatum(indexInfo[i],
1470 CurrentMemoryContext,
1473 iresult = index_insert(Irel[i],
1482 WriteBuffer(cur_buffer);
1484 ReleaseBuffer(Cbuf);
1488 cur_buffer = InvalidBuffer;
1490 chain_tuple_moved = true;
1494 /* try to find new page for this tuple */
1495 if (cur_buffer == InvalidBuffer ||
1496 !enough_space(cur_page, tuple_len))
1498 if (cur_buffer != InvalidBuffer)
1500 WriteBuffer(cur_buffer);
1501 cur_buffer = InvalidBuffer;
1504 * If previous target page is now too full to add *any*
1505 * tuple to it, remove it from fraged_pages.
1507 if (!enough_space(cur_page, vacrelstats->min_tlen))
1509 Assert(num_fraged_pages > cur_item);
1510 memmove(fraged_pages->pagedesc + cur_item,
1511 fraged_pages->pagedesc + cur_item + 1,
1512 sizeof(VacPage) * (num_fraged_pages - cur_item - 1));
1516 for (i = 0; i < num_fraged_pages; i++)
1518 if (enough_space(fraged_pages->pagedesc[i], tuple_len))
1521 if (i == num_fraged_pages)
1522 break; /* can't move item anywhere */
1524 cur_page = fraged_pages->pagedesc[cur_item];
1525 cur_buffer = ReadBuffer(onerel, cur_page->blkno);
1526 ToPage = BufferGetPage(cur_buffer);
1527 /* if this page was not used before - clean it */
1528 if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
1529 vacuum_page(ToPage, cur_page);
1533 heap_copytuple_with_tuple(&tuple, &newtup);
1535 RelationInvalidateHeapTuple(onerel, &tuple);
1538 * Mark new tuple as moved_in by vacuum and store vacuum XID
1541 TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
1542 newtup.t_data->t_infomask &=
1543 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1544 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1546 /* add tuple to the page */
1547 newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1548 InvalidOffsetNumber, LP_USED);
1549 if (newoff == InvalidOffsetNumber)
1552 failed to add item with len = %u to page %u (free space %u, nusd %u, noff %u)",
1553 tuple_len, cur_page->blkno, cur_page->free,
1554 cur_page->offsets_used, cur_page->offsets_free);
1556 newitemid = PageGetItemId(ToPage, newoff);
1557 pfree(newtup.t_data);
1558 newtup.t_datamcxt = NULL;
1559 newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1560 ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
1561 newtup.t_self = newtup.t_data->t_ctid;
1564 * Mark old tuple as moved_off by vacuum and store vacuum XID
1567 TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
1568 tuple.t_data->t_infomask &=
1569 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1570 tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1572 cur_page->offsets_used++;
1574 cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
1575 if (((int) cur_page->blkno) > last_move_dest_block)
1576 last_move_dest_block = cur_page->blkno;
1578 vacpage->offsets[vacpage->offsets_free++] = offnum;
1580 /* insert index' tuples if needed */
1581 if (Irel != (Relation *) NULL)
1584 * XXX using CurrentMemoryContext here means
1585 * intra-vacuum memory leak for functional indexes.
1586 * Should fix someday.
1588 * XXX This code fails to handle partial indexes!
1589 * Probably should change it to use ExecOpenIndices.
1591 for (i = 0; i < nindices; i++)
1593 FormIndexDatum(indexInfo[i],
1596 CurrentMemoryContext,
1599 iresult = index_insert(Irel[i],
1609 } /* walk along page */
1611 if (offnum < maxoff && keep_tuples > 0)
1615 for (off = OffsetNumberNext(offnum);
1617 off = OffsetNumberNext(off))
1619 itemid = PageGetItemId(page, off);
1620 if (!ItemIdIsUsed(itemid))
1622 tuple.t_datamcxt = NULL;
1623 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1624 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
1626 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1627 elog(ERROR, "Invalid XID in t_cmin (4)");
1628 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1629 elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
1630 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1632 /* some chains was moved while */
1633 if (chain_tuple_moved)
1634 { /* cleaning this page */
1635 Assert(vacpage->offsets_free > 0);
1636 for (i = 0; i < vacpage->offsets_free; i++)
1638 if (vacpage->offsets[i] == off)
1641 if (i >= vacpage->offsets_free) /* not found */
1643 vacpage->offsets[vacpage->offsets_free++] = off;
1644 Assert(keep_tuples > 0);
1650 vacpage->offsets[vacpage->offsets_free++] = off;
1651 Assert(keep_tuples > 0);
1658 if (vacpage->offsets_free > 0) /* some tuples were moved */
1660 if (chain_tuple_moved) /* else - they are ordered */
1662 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
1663 sizeof(OffsetNumber), vac_cmp_offno);
1665 reap_page(&Nvacpagelist, vacpage);
1673 if (offnum <= maxoff)
1674 break; /* some item(s) left */
1676 } /* walk along relation */
1678 blkno++; /* new number of blocks */
1680 if (cur_buffer != InvalidBuffer)
1682 Assert(num_moved > 0);
1683 WriteBuffer(cur_buffer);
1690 * We have to commit our tuple' movings before we'll truncate
1691 * relation, but we shouldn't lose our locks. And so - quick hack:
1692 * flush buffers and record status of current transaction as
1693 * committed, and continue. - vadim 11/13/96
1696 TransactionIdCommit(myXID);
1701 * Clean uncleaned reaped pages from vacuum_pages list and set
1702 * xmin committed for inserted tuples
1705 for (i = 0, curpage = vacuum_pages->pagedesc; i < vacuumed_pages; i++, curpage++)
1707 Assert((*curpage)->blkno < (BlockNumber) blkno);
1708 buf = ReadBuffer(onerel, (*curpage)->blkno);
1709 page = BufferGetPage(buf);
1710 if ((*curpage)->offsets_used == 0) /* this page was not used */
1712 if (!PageIsEmpty(page))
1713 vacuum_page(page, *curpage);
1716 /* this page was used */
1719 max_offset = PageGetMaxOffsetNumber(page);
1720 for (newoff = FirstOffsetNumber;
1721 newoff <= max_offset;
1722 newoff = OffsetNumberNext(newoff))
1724 itemid = PageGetItemId(page, newoff);
1725 if (!ItemIdIsUsed(itemid))
1727 tuple.t_datamcxt = NULL;
1728 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1729 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1731 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1732 elog(ERROR, "Invalid XID in t_cmin (2)");
1733 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1735 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
1738 else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1739 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
1741 elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
1744 Assert((*curpage)->offsets_used == num_tuples);
1745 checked_moved += num_tuples;
1749 Assert(num_moved == checked_moved);
1751 elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u. %s",
1752 RelationGetRelationName(onerel),
1753 nblocks, blkno, num_moved,
1756 if (Nvacpagelist.num_pages > 0)
1758 /* vacuum indices again if needed */
1759 if (Irel != (Relation *) NULL)
1765 /* re-sort Nvacpagelist.pagedesc */
1766 for (vpleft = Nvacpagelist.pagedesc,
1767 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
1768 vpleft < vpright; vpleft++, vpright--)
1774 Assert(keep_tuples >= 0);
1775 for (i = 0; i < nindices; i++)
1776 vacuum_index(&Nvacpagelist, Irel[i],
1777 vacrelstats->num_tuples, keep_tuples);
1780 /* clean moved tuples from last page in Nvacpagelist list */
1781 if (vacpage->blkno == (BlockNumber) (blkno - 1) &&
1782 vacpage->offsets_free > 0)
1784 buf = ReadBuffer(onerel, vacpage->blkno);
1785 page = BufferGetPage(buf);
1787 for (offnum = FirstOffsetNumber;
1789 offnum = OffsetNumberNext(offnum))
1791 itemid = PageGetItemId(page, offnum);
1792 if (!ItemIdIsUsed(itemid))
1794 tuple.t_datamcxt = NULL;
1795 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1797 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1799 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1800 elog(ERROR, "Invalid XID in t_cmin (3)");
1801 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1803 itemid->lp_flags &= ~LP_USED;
1807 elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
1811 Assert(vacpage->offsets_free == num_tuples);
1812 PageRepairFragmentation(page);
1816 /* now - free new list of reaped pages */
1817 curpage = Nvacpagelist.pagedesc;
1818 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
1820 pfree(Nvacpagelist.pagedesc);
1823 /* truncate relation, after flushing any dirty pages out to disk */
1824 if (blkno < nblocks)
1826 i = FlushRelationBuffers(onerel, blkno);
1828 elog(FATAL, "VACUUM (repair_frag): FlushRelationBuffers returned %d", i);
1829 blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
1831 vacrelstats->num_pages = blkno; /* set new number of blocks */
1834 if (Irel != (Relation *) NULL) /* pfree index' allocations */
1836 close_indices(nindices, Irel);
1841 if (vacrelstats->vtlinks != NULL)
1842 pfree(vacrelstats->vtlinks);
1847 * vacuum_heap() -- free dead tuples
1849 * This routine marks dead tuples as unused and truncates relation
1850 * if there are "empty" end-blocks.
1853 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
1861 nblocks = vacuum_pages->num_pages;
1862 nblocks -= vacuum_pages->empty_end_pages; /* nothing to do with
1865 for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
1867 if ((*vacpage)->offsets_free > 0)
1869 buf = ReadBuffer(onerel, (*vacpage)->blkno);
1870 page = BufferGetPage(buf);
1871 vacuum_page(page, *vacpage);
1876 /* truncate relation if there are some empty end-pages */
1877 if (vacuum_pages->empty_end_pages > 0)
1879 Assert(vacrelstats->num_pages >= vacuum_pages->empty_end_pages);
1880 nblocks = vacrelstats->num_pages - vacuum_pages->empty_end_pages;
1881 elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u.",
1882 RelationGetRelationName(onerel),
1883 vacrelstats->num_pages, nblocks);
1886 * We have to flush "empty" end-pages (if changed, but who knows
1887 * it) before truncation
1889 * XXX is FlushBufferPool() still needed here?
1893 i = FlushRelationBuffers(onerel, nblocks);
1895 elog(FATAL, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d", i);
1897 nblocks = smgrtruncate(DEFAULT_SMGR, onerel, nblocks);
1898 Assert(nblocks >= 0);
1899 vacrelstats->num_pages = nblocks; /* set new number of
1906 * vacuum_page() -- free dead tuples on a page
1907 * and repair its fragmentation.
1910 vacuum_page(Page page, VacPage vacpage)
1915 /* There shouldn't be any tuples moved onto the page yet! */
1916 Assert(vacpage->offsets_used == 0);
1918 for (i = 0; i < vacpage->offsets_free; i++)
1920 itemid = &(((PageHeader) page)->pd_linp[vacpage->offsets[i] - 1]);
1921 itemid->lp_flags &= ~LP_USED;
1923 PageRepairFragmentation(page);
1928 * _scan_index() -- scan one index relation to update statistic.
1932 scan_index(Relation indrel, int num_tuples)
1934 RetrieveIndexResult res;
1935 IndexScanDesc iscan;
1940 getrusage(RUSAGE_SELF, &ru0);
1942 /* walk through the entire index */
1943 iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
1946 while ((res = index_getnext(iscan, ForwardScanDirection))
1947 != (RetrieveIndexResult) NULL)
1953 index_endscan(iscan);
1955 /* now update statistics in pg_class */
1956 nipages = RelationGetNumberOfBlocks(indrel);
1957 update_relstats(RelationGetRelid(indrel), nipages, nitups, false, NULL);
1959 elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u. %s",
1960 RelationGetRelationName(indrel), nipages, nitups,
1963 if (nitups != num_tuples)
1964 elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
1965 \n\tRecreate the index.",
1966 RelationGetRelationName(indrel), nitups, num_tuples);
1971 * vacuum_index() -- vacuum one index relation.
1973 * Vpl is the VacPageList of the heap we're currently vacuuming.
1974 * It's locked. Indrel is an index relation on the vacuumed heap.
1975 * We don't set locks on the index relation here, since the indexed
1976 * access methods support locking at different granularities.
1977 * We let them handle it.
1979 * Finally, we arrange to update the index relation's statistics in
1983 vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples)
1985 RetrieveIndexResult res;
1986 IndexScanDesc iscan;
1987 ItemPointer heapptr;
1989 int num_index_tuples;
1994 getrusage(RUSAGE_SELF, &ru0);
1996 /* walk through the entire index */
1997 iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
1999 num_index_tuples = 0;
2001 while ((res = index_getnext(iscan, ForwardScanDirection))
2002 != (RetrieveIndexResult) NULL)
2004 heapptr = &res->heap_iptr;
2006 if ((vp = tid_reaped(heapptr, vacpagelist)) != (VacPage) NULL)
2009 elog(DEBUG, "<%x,%x> -> <%x,%x>",
2010 ItemPointerGetBlockNumber(&(res->index_iptr)),
2011 ItemPointerGetOffsetNumber(&(res->index_iptr)),
2012 ItemPointerGetBlockNumber(&(res->heap_iptr)),
2013 ItemPointerGetOffsetNumber(&(res->heap_iptr)));
2015 if (vp->offsets_free == 0)
2017 elog(NOTICE, "Index %s: pointer to EmptyPage (blk %u off %u) - fixing",
2018 RelationGetRelationName(indrel),
2019 vp->blkno, ItemPointerGetOffsetNumber(heapptr));
2022 index_delete(indrel, &res->index_iptr);
2030 index_endscan(iscan);
2032 /* now update statistics in pg_class */
2033 num_pages = RelationGetNumberOfBlocks(indrel);
2034 update_relstats(RelationGetRelid(indrel), num_pages, num_index_tuples, false, NULL);
2036 elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u: Deleted %u. %s",
2037 RelationGetRelationName(indrel), num_pages,
2038 num_index_tuples - keep_tuples, tups_vacuumed,
2041 if (num_index_tuples != num_tuples + keep_tuples)
2042 elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
2043 \n\tRecreate the index.",
2044 RelationGetRelationName(indrel), num_index_tuples, num_tuples);
2049 * tid_reaped() -- is a particular tid reaped?
2051 * vacpagelist->VacPage_array is sorted in right order.
2054 tid_reaped(ItemPointer itemptr, VacPageList vacpagelist)
2056 OffsetNumber ioffno;
2060 VacPageData vacpage;
2062 vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
2063 ioffno = ItemPointerGetOffsetNumber(itemptr);
2066 vpp = (VacPage *) vac_find_eq((void *) (vacpagelist->pagedesc),
2067 vacpagelist->num_pages, sizeof(VacPage), (void *) &vp,
2070 if (vpp == (VacPage *) NULL)
2071 return (VacPage) NULL;
2074 /* ok - we are on true page */
2076 if (vp->offsets_free == 0)
2077 { /* this is EmptyPage !!! */
2081 voff = (OffsetNumber *) vac_find_eq((void *) (vp->offsets),
2082 vp->offsets_free, sizeof(OffsetNumber), (void *) &ioffno,
2085 if (voff == (OffsetNumber *) NULL)
2086 return (VacPage) NULL;
2093 * update_relstats() -- update statistics for one relation
2095 * Statistics are stored in several places: the pg_class row for the
2096 * relation has stats about the whole relation, the pg_attribute rows
2097 * for each attribute store "disbursion", and there is a pg_statistic
2098 * row for each (non-system) attribute. (Disbursion probably ought to
2099 * be moved to pg_statistic, but it's not worth doing unless there's
2100 * another reason to have to change pg_attribute.) Disbursion and
2101 * pg_statistic values are only updated by VACUUM ANALYZE, but we
2102 * always update the stats in pg_class.
2104 * This routine works for both index and heap relation entries in
2105 * pg_class. We violate no-overwrite semantics here by storing new
2106 * values for the statistics columns directly into the pg_class
2107 * tuple that's already on the page. The reason for this is that if
2108 * we updated these tuples in the usual way, vacuuming pg_class itself
2109 * wouldn't work very well --- by the time we got done with a vacuum
2110 * cycle, most of the tuples in pg_class would've been obsoleted.
2111 * Updating pg_class's own statistics would be especially tricky.
2112 * Of course, this only works for fixed-size never-null columns, but
2116 update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex,
2117 VRelStats *vacrelstats)
2122 Form_pg_class pgcform;
2126 * update number of tuples and number of pages in pg_class
2128 rd = heap_openr(RelationRelationName, RowExclusiveLock);
2130 ctup = SearchSysCacheTupleCopy(RELOID,
2131 ObjectIdGetDatum(relid),
2133 if (!HeapTupleIsValid(ctup))
2134 elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
2137 /* get the buffer cache tuple */
2138 rtup.t_self = ctup->t_self;
2139 heap_fetch(rd, SnapshotNow, &rtup, &buffer);
2140 heap_freetuple(ctup);
2142 /* overwrite the existing statistics in the tuple */
2143 pgcform = (Form_pg_class) GETSTRUCT(&rtup);
2144 pgcform->reltuples = num_tuples;
2145 pgcform->relpages = num_pages;
2146 pgcform->relhasindex = hasindex;
2148 /* invalidate the tuple in the cache and write the buffer */
2149 RelationInvalidateHeapTuple(rd, &rtup);
2150 WriteBuffer(buffer);
2152 heap_close(rd, RowExclusiveLock);
2156 * reap_page() -- save a page on the array of reaped pages.
2158 * As a side effect of the way that the vacuuming loop for a given
2159 * relation works, higher pages come after lower pages in the array
2160 * (and highest tid on a page is last).
2163 reap_page(VacPageList vacpagelist, VacPage vacpage)
2167 /* allocate a VacPageData entry */
2168 newvacpage = (VacPage) palloc(sizeof(VacPageData) + vacpage->offsets_free * sizeof(OffsetNumber));
2171 if (vacpage->offsets_free > 0)
2172 memmove(newvacpage->offsets, vacpage->offsets, vacpage->offsets_free * sizeof(OffsetNumber));
2173 newvacpage->blkno = vacpage->blkno;
2174 newvacpage->free = vacpage->free;
2175 newvacpage->offsets_used = vacpage->offsets_used;
2176 newvacpage->offsets_free = vacpage->offsets_free;
2178 /* insert this page into vacpagelist list */
2179 vpage_insert(vacpagelist, newvacpage);
2184 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
2186 #define PG_NPAGEDESC 1024
2188 /* allocate a VacPage entry if needed */
2189 if (vacpagelist->num_pages == 0)
2191 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
2192 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
2194 else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
2196 vacpagelist->num_allocated_pages *= 2;
2197 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
2199 vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
2200 (vacpagelist->num_pages)++;
/*
 *	vac_find_eq() -- binary search for an element equal to *elm.
 *
 *		bot		base of an array sorted ascending according to compar
 *		nelem	number of elements in the array
 *		size	size of one element in bytes
 *		elm		pointer to the key
 *		compar	qsort-style comparator
 *
 *		Returns a pointer to a matching array element, or NULL if there is
 *		none.  An empty array (nelem <= 0) or a NULL base yields NULL
 *		without ever calling the comparator.
 *
 *		Note that the comparator is invoked as compar(element, key) for the
 *		very first probe and as compar(key, element) for all later ones, so
 *		key and elements must be of the same kind.
 */
static void *
vac_find_eq(void *bot, int nelem, int size, void *elm,
			int (*compar) (const void *, const void *))
{
	int			res;
	int			last = nelem - 1;
	int			celm = nelem / 2;
	bool		last_move,
				first_move;

	/*
	 * Nothing to search.  Without this guard the probes below would read
	 * bot[0] and bot[-1] of an empty (possibly NULL) array.
	 */
	if (bot == NULL || nelem <= 0)
		return NULL;

	last_move = first_move = true;
	for (;;)
	{
		/* lower bound moved: is the key below (or at) the first element? */
		if (first_move == true)
		{
			res = compar(bot, elm);
			if (res > 0)
				return NULL;
			if (res == 0)
				return bot;
			first_move = false;
		}
		/* upper bound moved: is the key above (or at) the last element? */
		if (last_move == true)
		{
			res = compar(elm, (void *) ((char *) bot + last * size));
			if (res > 0)
				return NULL;
			if (res == 0)
				return (void *) ((char *) bot + last * size);
			last_move = false;
		}
		res = compar(elm, (void *) ((char *) bot + celm * size));
		if (res == 0)
			return (void *) ((char *) bot + celm * size);
		if (res < 0)
		{
			/* continue in the lower half */
			if (celm == 0)
				return NULL;
			last = celm - 1;
			celm = celm / 2;
			last_move = true;
			continue;
		}

		if (celm == last)
			return NULL;

		/* continue in the upper half: rebase the array past celm */
		last = last - celm - 1;
		bot = (void *) ((char *) bot + (celm + 1) * size);
		celm = (last + 1) / 2;
		first_move = true;
	}
}
2260 vac_cmp_blk(const void *left, const void *right)
2265 lblk = (*((VacPage *) left))->blkno;
2266 rblk = (*((VacPage *) right))->blkno;
2277 vac_cmp_offno(const void *left, const void *right)
2280 if (*(OffsetNumber *) left < *(OffsetNumber *) right)
2282 if (*(OffsetNumber *) left == *(OffsetNumber *) right)
2289 vac_cmp_vtlinks(const void *left, const void *right)
2292 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
2293 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2295 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
2296 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2298 /* bi_hi-es are equal */
2299 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
2300 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2302 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
2303 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2305 /* bi_lo-es are equal */
2306 if (((VTupleLink) left)->new_tid.ip_posid <
2307 ((VTupleLink) right)->new_tid.ip_posid)
2309 if (((VTupleLink) left)->new_tid.ip_posid >
2310 ((VTupleLink) right)->new_tid.ip_posid)
2318 get_indices(Relation relation, int *nindices, Relation **Irel)
2324 indexoidlist = RelationGetIndexList(relation);
2326 *nindices = length(indexoidlist);
2329 *Irel = (Relation *) palloc(*nindices * sizeof(Relation));
2334 foreach(indexoidscan, indexoidlist)
2336 Oid indexoid = lfirsti(indexoidscan);
2338 (*Irel)[i] = index_open(indexoid);
2342 freeList(indexoidlist);
2347 close_indices(int nindices, Relation *Irel)
2350 if (Irel == (Relation *) NULL)
2354 index_close(Irel[nindices]);
2361 * Obtain IndexInfo data for each index on the rel
2364 get_index_desc(Relation onerel, int nindices, Relation *Irel)
2366 IndexInfo **indexInfo;
2368 HeapTuple cachetuple;
2370 indexInfo = (IndexInfo **) palloc(nindices * sizeof(IndexInfo *));
2372 for (i = 0; i < nindices; i++)
2374 cachetuple = SearchSysCacheTuple(INDEXRELID,
2375 ObjectIdGetDatum(RelationGetRelid(Irel[i])),
2377 if (!HeapTupleIsValid(cachetuple))
2378 elog(ERROR, "get_index_desc: index %u not found",
2379 RelationGetRelid(Irel[i]));
2380 indexInfo[i] = BuildIndexInfo(cachetuple);
2388 enough_space(VacPage vacpage, Size len)
2391 len = MAXALIGN(len);
2393 if (len > vacpage->free)
2396 if (vacpage->offsets_used < vacpage->offsets_free) /* there are free
2398 return true; /* and len <= free_space */
2400 /* ok. noff_usd >= noff_free and so we'll have to allocate new itemid */
2401 if (len + MAXALIGN(sizeof(ItemIdData)) <= vacpage->free)
/*
 * Compute elapsed CPU time since the ru0 usage snapshot, and format it
 * into a displayable string of the form "CPU <sys>s/<user>u sec." with
 * hundredths-of-a-second resolution.
 *
 * The result lives in a static buffer, so it is overwritten by the next
 * call and the function is not reentrant.
 */
static char *
show_rusage(struct rusage * ru0)
{
	static char result[64];
	struct rusage now;
	struct timeval sys;
	struct timeval usr;

	getrusage(RUSAGE_SELF, &now);

	sys = now.ru_stime;
	usr = now.ru_utime;

	/* borrow one second wherever the microsecond difference is negative */
	if (sys.tv_usec < ru0->ru_stime.tv_usec)
	{
		sys.tv_sec--;
		sys.tv_usec += 1000000;
	}
	if (usr.tv_usec < ru0->ru_utime.tv_usec)
	{
		usr.tv_sec--;
		usr.tv_usec += 1000000;
	}

	snprintf(result, sizeof(result),
			 "CPU %d.%02ds/%d.%02du sec.",
			 (int) (sys.tv_sec - ru0->ru_stime.tv_sec),
			 (int) (sys.tv_usec - ru0->ru_stime.tv_usec) / 10000,
			 (int) (usr.tv_sec - ru0->ru_utime.tv_sec),
			 (int) (usr.tv_usec - ru0->ru_utime.tv_usec) / 10000);

	return result;
}