1 /*-------------------------------------------------------------------------
4 * the postgres vacuum cleaner
6 * Portions Copyright (c) 1996-2000, PostgreSQL, Inc
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.170 2000/10/24 09:56:15 vadim Exp $
14 *-------------------------------------------------------------------------
16 #include <sys/types.h>
24 #include "access/genam.h"
25 #include "access/heapam.h"
26 #include "catalog/catalog.h"
27 #include "catalog/catname.h"
28 #include "catalog/index.h"
29 #include "commands/vacuum.h"
30 #include "miscadmin.h"
31 #include "nodes/execnodes.h"
32 #include "storage/sinval.h"
33 #include "storage/smgr.h"
34 #include "tcop/tcopprot.h"
35 #include "utils/acl.h"
36 #include "utils/builtins.h"
37 #include "utils/fmgroids.h"
38 #include "utils/inval.h"
39 #include "utils/relcache.h"
40 #include "utils/syscache.h"
41 #include "utils/temprel.h"
43 #ifndef HAVE_GETRUSAGE
44 #include "rusagestub.h"
47 #include <sys/resource.h>
51 #include "access/xlog.h"
52 XLogRecPtr log_heap_move(Relation reln,
53 ItemPointerData from, HeapTuple newtup);
56 static MemoryContext vac_context = NULL;
58 static int MESSAGE_LEVEL; /* message level */
60 static TransactionId XmaxRecent;
62 /* non-export function prototypes */
63 static void vacuum_init(void);
64 static void vacuum_shutdown(void);
65 static void vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2);
66 static VRelList getrels(NameData *VacRelP);
67 static void vacuum_rel(Oid relid, bool analyze, bool is_toastrel);
68 static void scan_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages);
69 static void repair_frag(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages, int nindices, Relation *Irel);
70 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacpagelist);
71 static void vacuum_page(Page page, VacPage vacpage);
72 static void vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples);
73 static void scan_index(Relation indrel, int num_tuples);
74 static void update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex, VRelStats *vacrelstats);
75 static VacPage tid_reaped(ItemPointer itemptr, VacPageList vacpagelist);
76 static void reap_page(VacPageList vacpagelist, VacPage vacpage);
77 static void vpage_insert(VacPageList vacpagelist, VacPage vpnew);
78 static void get_indices(Relation relation, int *nindices, Relation **Irel);
79 static void close_indices(int nindices, Relation *Irel);
80 static IndexInfo **get_index_desc(Relation onerel, int nindices,
82 static void *vac_find_eq(void *bot, int nelem, int size, void *elm,
83 int (*compar) (const void *, const void *));
84 static int vac_cmp_blk(const void *left, const void *right);
85 static int vac_cmp_offno(const void *left, const void *right);
86 static int vac_cmp_vtlinks(const void *left, const void *right);
87 static bool enough_space(VacPage vacpage, Size len);
88 static char *show_rusage(struct rusage * ru0);
/*
 * vacuum() -- top-level entry point for the VACUUM SQL command.
 *
 * vacrel    - name of the single relation to vacuum (copied to safe
 *             storage below; NULL/whole-database handling is in lines
 *             elided from this excerpt -- confirm in the full file).
 * verbose   - presumably selects NOTICE vs. DEBUG for MESSAGE_LEVEL
 *             below; the controlling condition is elided -- verify.
 * analyze   - whether to run ANALYZE after vacuuming.
 * anal_cols - optional column list for ANALYZE; only legal with analyze.
 *
 * NOTE(review): this excerpt elides many original source lines; the
 * comments added here describe only what is visible.
 */
92 vacuum(char *vacrel, bool verbose, bool analyze, List *anal_cols)
98 List *anal_cols2 = NIL;
/* A column list without ANALYZE is meaningless -- reject it up front. */
100 if (anal_cols != NIL && !analyze)
101 elog(ERROR, "Can't vacuum columns, only tables. You can 'vacuum analyze' columns.");
104 * We cannot run VACUUM inside a user transaction block; if we were
105 * inside a transaction, then our commit- and
106 * start-transaction-command calls would not have the intended effect!
107 * Furthermore, the forced commit that occurs before truncating the
108 * relation's file would have the effect of committing the rest of the
109 * user's transaction too, which would certainly not be the desired
112 if (IsTransactionBlock())
113 elog(ERROR, "VACUUM cannot run inside a BEGIN/END block")
116 MESSAGE_LEVEL = NOTICE;
118 MESSAGE_LEVEL = DEBUG;
121 * Create special memory context for cross-transaction storage.
123 * Since it is a child of QueryContext, it will go away eventually
124 * even if we suffer an error; there's no need for special abort
127 vac_context = AllocSetContextCreate(QueryContext,
129 ALLOCSET_DEFAULT_MINSIZE,
130 ALLOCSET_DEFAULT_INITSIZE,
131 ALLOCSET_DEFAULT_MAXSIZE);
133 /* vacrel gets de-allocated on xact commit, so copy it to safe storage */
136 namestrcpy(&VacRel, vacrel);
137 VacRelName = &VacRel;
142 /* must also copy the column list, if any, to safe storage */
/* pstrdup() here allocates in vac_context, so the copies survive commit */
143 old = MemoryContextSwitchTo(vac_context);
144 foreach(le, anal_cols)
146 char *col = (char *) lfirst(le);
148 anal_cols2 = lappend(anal_cols2, pstrdup(col));
150 MemoryContextSwitchTo(old);
153 * Start up the vacuum cleaner.
155 * NOTE: since this commits the current transaction, the memory holding
156 * any passed-in parameters gets freed here. We must have already
157 * copied pass-by-reference parameters to safe storage. Don't make me
162 /* vacuum the database */
163 vac_vacuum(VacRelName, analyze, anal_cols2);
170 * vacuum_init(), vacuum_shutdown() -- start up and shut down the vacuum cleaner.
172 * Formerly, there was code here to prevent more than one VACUUM from
173 * executing concurrently in the same database. However, there's no
174 * good reason to prevent that, and manually removing lockfiles after
175 * a vacuum crash was a pain for dbadmins. So, forget about lockfiles,
176 * and just rely on the exclusive lock we grab on each target table
177 * to ensure that there aren't two VACUUMs running on the same table
180 * The strangeness with committing and starting transactions in the
181 * init and shutdown routines is due to the fact that the vacuum cleaner
182 * is invoked via an SQL command, and so is already executing inside
183 * a transaction. We need to leave ourselves in a predictable state
184 * on entry and exit to the vacuum cleaner. We commit the transaction
185 * started in PostgresMain() inside vacuum_init(), and start one in
186 * vacuum_shutdown() to match the commit waiting for us back in
/*
 * vacuum_init() -- close out the command's enclosing transaction so each
 * relation can later be vacuumed in its own transaction (see block
 * comment above; function signature is elided from this excerpt).
 */
192 /* matches the StartTransaction in PostgresMain() */
193 CommitTransactionCommand();
/*
 * vacuum_shutdown() -- restore the transaction state PostgresMain()
 * expects and discard VACUUM's cross-transaction working memory.
 * (Function signature is elided from this excerpt.)
 */
199 /* on entry, we are not in a transaction */
202 * Flush the init file that relcache.c uses to save startup time. The
203 * next backend startup will rebuild the init file with up-to-date
204 * information from pg_class. This lets the optimizer see the stats
205 * that we've collected for certain critical system indexes. See
206 * relcache.c for more details.
208 * Ignore any failure to unlink the file, since it might not be there if
209 * no backend has been started since the last vacuum...
211 unlink(RELCACHE_INIT_FILENAME);
213 /* matches the CommitTransaction in PostgresMain() */
214 StartTransactionCommand();
217 * Clean up working storage --- note we must do this after
218 * StartTransactionCommand, else we might be trying to delete
219 * the active context!
221 MemoryContextDelete(vac_context);
226 * vac_vacuum() -- vacuum the database.
228 * This routine builds a list of relations to vacuum, and then calls
229 * code that vacuums them one at a time. We are careful to vacuum each
230 * relation in a separate transaction in order to avoid holding too many
/*
 * VacRelP    - relation name filter passed through to getrels()
 * analyze    - forwarded to vacuum_rel() for each relation
 * anal_cols2 - column list (already copied to vac_context by vacuum())
 *
 * NOTE(review): the condition guarding the analyze_rel() call is elided
 * from this excerpt -- presumably it runs only when analyze is true.
 */
234 vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2)
239 /* get list of relations */
240 vrl = getrels(VacRelP);
242 /* vacuum each heap relation */
243 for (cur = vrl; cur != (VRelList) NULL; cur = cur->vrl_next)
245 vacuum_rel(cur->vrl_relid, analyze, false);
246 /* analyze separately so locking is minimized */
248 analyze_rel(cur->vrl_relid, anal_cols2, MESSAGE_LEVEL);
/*
 * getrels() -- build the VRelList of plain relations (relkind 'r') to
 * vacuum, either the single named table or every table in pg_class.
 * List cells are allocated in vac_context so they survive the per-table
 * transaction commits. Runs inside its own transaction.
 * (Several interior lines are elided from this excerpt.)
 */
253 getrels(NameData *VacRelP)
268 StartTransactionCommand();
/* non-empty name => vacuum exactly one table; else scan all of pg_class */
270 if (NameStr(*VacRelP))
274 * we could use the cache here, but it is clearer to use scankeys
275 * for both vacuum cases, bjm 2000/01/19
277 char *nontemp_relname;
279 /* We must re-map temp table names bjm 2000-04-06 */
280 if ((nontemp_relname =
281 get_temp_rel_by_username(NameStr(*VacRelP))) == NULL)
282 nontemp_relname = NameStr(*VacRelP);
284 ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relname,
286 PointerGetDatum(nontemp_relname));
290 ScanKeyEntryInitialize(&key, 0x0, Anum_pg_class_relkind,
291 F_CHAREQ, CharGetDatum('r'))
294 vrl = cur = (VRelList) NULL;
296 rel = heap_openr(RelationRelationName, AccessShareLock);
297 tupdesc = RelationGetDescr(rel);
299 scan = heap_beginscan(rel, false, SnapshotNow, 1, &key);
301 while (HeapTupleIsValid(tuple = heap_getnext(scan, 0)))
305 d = heap_getattr(tuple, Anum_pg_class_relname, tupdesc, &n);
308 d = heap_getattr(tuple, Anum_pg_class_relkind, tupdesc, &n);
310 rkind = DatumGetChar(d);
/* only plain heaps are vacuumable (message typo "indecies" is in the
 * runtime string and is left untouched here) */
312 if (rkind != RELKIND_RELATION)
314 elog(NOTICE, "Vacuum: can not process indecies, views and certain system tables");
318 /* get a relation list entry for this guy */
319 if (vrl == (VRelList) NULL)
320 vrl = cur = (VRelList)
321 MemoryContextAlloc(vac_context, sizeof(VRelListData));
324 cur->vrl_next = (VRelList)
325 MemoryContextAlloc(vac_context, sizeof(VRelListData));
329 cur->vrl_relid = tuple->t_data->t_oid;
330 cur->vrl_next = (VRelList) NULL;
334 heap_close(rel, AccessShareLock);
337 elog(NOTICE, "Vacuum: table not found");
339 CommitTransactionCommand();
345 * vacuum_rel() -- vacuum one heap relation
347 * This routine vacuums a single heap, cleans out its indices, and
348 * updates its statistics num_pages and num_tuples statistics.
350 * Doing one heap at a time incurs extra overhead, since we need to
351 * check that the heap exists again just before we vacuum it. The
352 * reason that we do this is so that vacuuming can be spread across
353 * many small transactions. Otherwise, two-phase locking would require
354 * us to lock the entire database during one pass of the vacuum cleaner.
/*
 * relid      - pg_class OID of the relation to process
 * analyze    - carried in the signature; its visible use is elided in
 *              this excerpt -- confirm in the full file
 * is_toastrel- true when called recursively for a TOAST table (below)
 *
 * Runs in its own transaction; the AccessExclusiveLock taken here is
 * held until the commit at the end.
 */
357 vacuum_rel(Oid relid, bool analyze, bool is_toastrel)
361 VacPageListData vacuum_pages; /* List of pages to vacuum and/or clean
363 VacPageListData fraged_pages; /* List of pages with space enough for
369 VRelStats *vacrelstats;
370 bool reindex = false;
374 StartTransactionCommand();
377 * Check for user-requested abort. Note we want this to be inside a
378 * transaction, so xact.c doesn't issue useless NOTICE.
384 * Race condition -- if the pg_class tuple has gone away since the
385 * last time we saw it, we don't need to vacuum it.
387 tuple = SearchSysCacheTuple(RELOID,
388 ObjectIdGetDatum(relid),
390 if (!HeapTupleIsValid(tuple))
393 CommitTransactionCommand();
398 * Open the class, get an exclusive lock on it, and check permissions.
400 * Note we choose to treat permissions failure as a NOTICE and keep
401 * trying to vacuum the rest of the DB --- is this appropriate?
403 onerel = heap_open(relid, AccessExclusiveLock);
405 if (!pg_ownercheck(GetUserId(), RelationGetRelationName(onerel),
408 elog(NOTICE, "Skipping \"%s\" --- only table owner can VACUUM it",
409 RelationGetRelationName(onerel));
410 heap_close(onerel, AccessExclusiveLock);
412 CommitTransactionCommand();
417 * Remember the relation's TOAST relation for later
419 toast_relid = onerel->rd_rel->reltoastrelid;
422 * Set up statistics-gathering machinery.
424 vacrelstats = (VRelStats *) palloc(sizeof(VRelStats));
425 vacrelstats->relid = relid;
426 vacrelstats->num_pages = vacrelstats->num_tuples = 0;
427 vacrelstats->hasindex = false;
/* snapshot the oldest xmax of interest; scan_heap reads the global */
429 GetXmaxRecent(&XmaxRecent);
433 vacuum_pages.num_pages = fraged_pages.num_pages = 0;
434 scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages);
435 if (IsIgnoringSystemIndexes() &&
436 IsSystemRelationName(RelationGetRelationName(onerel)))
439 /* Now open indices */
441 Irel = (Relation *) NULL;
442 get_indices(onerel, &nindices, &Irel);
445 else if (!RelationGetForm(onerel)->relhasindex)
448 vacrelstats->hasindex = true;
450 vacrelstats->hasindex = false;
/* reindex path: close indices and deactivate them before heap cleanup */
453 for (i = 0; i < nindices; i++)
454 index_close(Irel[i]);
455 Irel = (Relation *) NULL;
456 activate_indexes_of_a_table(relid, false);
459 /* Clean/scan index relation(s) */
460 if (Irel != (Relation *) NULL)
462 if (vacuum_pages.num_pages > 0)
464 for (i = 0; i < nindices; i++)
465 vacuum_index(&vacuum_pages, Irel[i],
466 vacrelstats->num_tuples, 0);
470 /* just scan indices to update statistic */
471 for (i = 0; i < nindices; i++)
472 scan_index(Irel[i], vacrelstats->num_tuples);
476 if (fraged_pages.num_pages > 0)
478 /* Try to shrink heap */
479 repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages,
484 if (Irel != (Relation *) NULL)
485 close_indices(nindices, Irel);
486 if (vacuum_pages.num_pages > 0)
488 /* Clean pages from vacuum_pages list */
489 vacuum_heap(vacrelstats, onerel, &vacuum_pages);
494 * Flush dirty pages out to disk. We must do this even if we
495 * didn't do anything else, because we want to ensure that all
496 * tuples have correct on-row commit status on disk (see
497 * bufmgr.c's comments for FlushRelationBuffers()).
499 i = FlushRelationBuffers(onerel, vacrelstats->num_pages);
501 elog(ERROR, "VACUUM (vacuum_rel): FlushRelationBuffers returned %d",
506 activate_indexes_of_a_table(relid, true);
509 * ok - free vacuum_pages list of reaped pages
511 * Isn't this a waste of code? Upcoming commit should free memory, no?
513 if (vacuum_pages.num_pages > 0)
515 vacpage = vacuum_pages.pagedesc;
516 for (i = 0; i < vacuum_pages.num_pages; i++, vacpage++)
518 pfree(vacuum_pages.pagedesc);
/* fraged_pages entries alias vacuum_pages entries (see scan_heap's
 * vpage_insert of the same VacPage pointers); only the descriptor
 * array itself is freed here, so there is no double free */
519 if (fraged_pages.num_pages > 0)
520 pfree(fraged_pages.pagedesc);
523 /* all done with this class, but hold lock until commit */
524 heap_close(onerel, NoLock);
526 /* update statistics in pg_class */
527 update_relstats(vacrelstats->relid, vacrelstats->num_pages,
528 vacrelstats->num_tuples, vacrelstats->hasindex,
532 * If the relation has a secondary toast one, vacuum that too
533 * while we still hold the lock on the master table. We don't
534 * need to propagate "analyze" to it, because the toaster
535 * always uses hardcoded index access and statistics are
536 * totally unimportant for toast relations
538 if (toast_relid != InvalidOid)
539 vacuum_rel(toast_relid, false, true);
541 /* next command frees attribute stats */
543 CommitTransactionCommand();
547 * scan_heap() -- scan an open heap relation
549 * This routine sets commit times, constructs vacuum_pages list of
550 * empty/uninitialized pages and pages with dead tuples and
551 * ~LP_USED line pointers, constructs fraged_pages list of pages
552 * appropriate for purposes of shrinking and maintains statistics
553 * on the number of live tuples in a heap.
/*
 * vacrelstats  - out: num_tuples, min/max_tlen, vtlinks filled in here
 * onerel       - heap already opened and exclusively locked by caller
 * vacuum_pages - out: pages needing cleanup (dead tuples / unused slots)
 * fraged_pages - out: subset of vacuum_pages with reusable free space
 *
 * Relies on the file-static XmaxRecent set by vacuum_rel(). Several
 * interior lines are elided from this excerpt.
 */
556 scan_heap(VRelStats *vacrelstats, Relation onerel,
557 VacPageList vacuum_pages, VacPageList fraged_pages)
575 uint32 tups_vacuumed,
586 Size min_tlen = MaxTupleSize;
589 bool do_shrinking = true;
/* update-chain links (old tid -> new tid), grown by repalloc below */
590 VTupleLink vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData));
592 int free_vtlinks = 100;
595 getrusage(RUSAGE_SELF, &ru0);
597 relname = RelationGetRelationName(onerel);
598 elog(MESSAGE_LEVEL, "--Relation %s--", relname);
600 tups_vacuumed = num_tuples = nkeep = nunused = ncrash = empty_pages =
601 new_pages = changed_pages = empty_end_pages = 0;
602 free_size = usable_free_size = 0;
604 nblocks = RelationGetNumberOfBlocks(onerel);
/* one reusable scratch VacPage, sized for the worst-case offset count */
606 vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
607 vacpage->offsets_used = 0;
609 for (blkno = 0; blkno < nblocks; blkno++)
611 buf = ReadBuffer(onerel, blkno);
612 page = BufferGetPage(buf);
613 vacpage->blkno = blkno;
614 vacpage->offsets_free = 0;
618 elog(NOTICE, "Rel %s: Uninitialized page %u - fixing",
620 PageInit(page, BufferGetPageSize(buf), 0);
621 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
622 free_size += (vacpage->free - sizeof(ItemIdData));
625 reap_page(vacuum_pages, vacpage);
630 if (PageIsEmpty(page))
632 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
633 free_size += (vacpage->free - sizeof(ItemIdData));
636 reap_page(vacuum_pages, vacpage);
643 maxoff = PageGetMaxOffsetNumber(page);
644 for (offnum = FirstOffsetNumber;
646 offnum = OffsetNumberNext(offnum))
648 itemid = PageGetItemId(page, offnum);
651 * Collect un-used items too - it's possible to have indices
652 * pointing here after crash.
654 if (!ItemIdIsUsed(itemid))
656 vacpage->offsets[vacpage->offsets_free++] = offnum;
661 tuple.t_datamcxt = NULL;
662 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
663 tuple.t_len = ItemIdGetLength(itemid);
664 ItemPointerSet(&(tuple.t_self), blkno, offnum);
/* resolve xmin status for tuples not yet hinted XMIN_COMMITTED */
667 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
669 if (tuple.t_data->t_infomask & HEAP_XMIN_INVALID)
/* for MOVED_OFF/MOVED_IN, t_cmin overlays the moving xact's id
 * (note the TransactionId casts below) */
671 else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
673 if (TransactionIdDidCommit((TransactionId)
674 tuple.t_data->t_cmin))
676 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
682 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
686 else if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
688 if (!TransactionIdDidCommit((TransactionId)
689 tuple.t_data->t_cmin))
691 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
697 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
703 if (TransactionIdDidAbort(tuple.t_data->t_xmin))
705 else if (TransactionIdDidCommit(tuple.t_data->t_xmin))
707 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
710 else if (!TransactionIdIsInProgress(tuple.t_data->t_xmin))
714 * Not Aborted, Not Committed, Not in Progress -
715 * so it's from crashed process. - vadim 11/26/96
/* an in-progress inserter forces us to give up shrinking */
722 elog(NOTICE, "Rel %s: TID %u/%u: InsertTransactionInProgress %u - can't shrink relation",
723 relname, blkno, offnum, tuple.t_data->t_xmin);
724 do_shrinking = false;
730 * here we are concerned about tuples with xmin committed and
731 * xmax unknown or committed
733 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED &&
734 !(tuple.t_data->t_infomask & HEAP_XMAX_INVALID))
736 if (tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED)
/* SELECT FOR UPDATE lock, not a delete: clear the xmax hints */
738 if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
740 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
741 tuple.t_data->t_infomask &=
742 ~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
748 else if (TransactionIdDidAbort(tuple.t_data->t_xmax))
750 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
753 else if (TransactionIdDidCommit(tuple.t_data->t_xmax))
755 if (tuple.t_data->t_infomask & HEAP_MARKED_FOR_UPDATE)
757 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
758 tuple.t_data->t_infomask &=
759 ~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
765 else if (!TransactionIdIsInProgress(tuple.t_data->t_xmax))
769 * Not Aborted, Not Committed, Not in Progress - so it
770 * from crashed process. - vadim 06/02/97
772 tuple.t_data->t_infomask |= HEAP_XMAX_INVALID;
773 tuple.t_data->t_infomask &=
774 ~(HEAP_XMAX_COMMITTED | HEAP_MARKED_FOR_UPDATE);
779 elog(NOTICE, "Rel %s: TID %u/%u: DeleteTransactionInProgress %u - can't shrink relation",
780 relname, blkno, offnum, tuple.t_data->t_xmax);
781 do_shrinking = false;
785 * If tuple is recently deleted then we must not remove it
788 if (tupgone && (tuple.t_data->t_infomask & HEAP_XMIN_INVALID) == 0 && tuple.t_data->t_xmax >= XmaxRecent)
792 if (!(tuple.t_data->t_infomask & HEAP_XMAX_COMMITTED))
794 tuple.t_data->t_infomask |= HEAP_XMAX_COMMITTED;
799 * If we do shrinking and this tuple is updated one
800 * then remember it to construct updated tuple
803 if (do_shrinking && !(ItemPointerEquals(&(tuple.t_self),
804 &(tuple.t_data->t_ctid))))
806 if (free_vtlinks == 0)
809 vtlinks = (VTupleLink) repalloc(vtlinks,
810 (free_vtlinks + num_vtlinks) *
811 sizeof(VTupleLinkData));
813 vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid;
814 vtlinks[num_vtlinks].this_tid = tuple.t_self;
824 if (!OidIsValid(tuple.t_data->t_oid))
826 elog(NOTICE, "Rel %s: TID %u/%u: OID IS INVALID. TUPGONE %d.",
827 relname, blkno, offnum, tupgone);
835 * Here we are building a temporary copy of the page with
836 * dead tuples removed. Below we will apply
837 * PageRepairFragmentation to the copy, so that we can
838 * determine how much space will be available after
839 * removal of dead tuples. But note we are NOT changing
840 * the real page yet...
842 if (tempPage == (Page) NULL)
846 pageSize = PageGetPageSize(page);
847 tempPage = (Page) palloc(pageSize);
848 memmove(tempPage, page, pageSize);
851 /* mark it unused on the temp page */
852 lpp = &(((PageHeader) tempPage)->pd_linp[offnum - 1]);
853 lpp->lp_flags &= ~LP_USED;
855 vacpage->offsets[vacpage->offsets_free++] = offnum;
862 if (tuple.t_len < min_tlen)
863 min_tlen = tuple.t_len;
864 if (tuple.t_len > max_tlen)
865 max_tlen = tuple.t_len;
878 if (tempPage != (Page) NULL)
879 { /* Some tuples are gone */
880 PageRepairFragmentation(tempPage);
881 vacpage->free = ((PageHeader) tempPage)->pd_upper - ((PageHeader) tempPage)->pd_lower;
882 free_size += vacpage->free;
883 reap_page(vacuum_pages, vacpage);
885 tempPage = (Page) NULL;
887 else if (vacpage->offsets_free > 0)
888 { /* there are only ~LP_USED line pointers */
889 vacpage->free = ((PageHeader) page)->pd_upper - ((PageHeader) page)->pd_lower;
890 free_size += vacpage->free;
891 reap_page(vacuum_pages, vacpage);
903 /* save stats in the rel list for use later */
904 vacrelstats->num_tuples = num_tuples;
905 vacrelstats->num_pages = nblocks;
906 /* vacrelstats->natts = attr_cnt;*/
908 min_tlen = max_tlen = 0;
909 vacrelstats->min_tlen = min_tlen;
910 vacrelstats->max_tlen = max_tlen;
912 vacuum_pages->empty_end_pages = empty_end_pages;
913 fraged_pages->empty_end_pages = empty_end_pages;
916 * Try to make fraged_pages keeping in mind that we can't use free
917 * space of "empty" end-pages and last page if it reaped.
919 if (do_shrinking && vacuum_pages->num_pages - empty_end_pages > 0)
921 int nusf; /* blocks useful for re-use */
923 nusf = vacuum_pages->num_pages - empty_end_pages;
924 if ((vacuum_pages->pagedesc[nusf - 1])->blkno == nblocks - empty_end_pages - 1)
927 for (i = 0; i < nusf; i++)
929 vp = vacuum_pages->pagedesc[i];
/* only pages with room for at least one min-size tuple qualify */
930 if (enough_space(vp, min_tlen))
932 vpage_insert(fraged_pages, vp);
934 usable_free_size += vp->free;
938 if (usable_free_size > 0 && num_vtlinks > 0)
/* sort chain links so repair_frag can binary-search via vac_find_eq */
940 qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData),
942 vacrelstats->vtlinks = vtlinks;
943 vacrelstats->num_vtlinks = num_vtlinks;
947 vacrelstats->vtlinks = NULL;
948 vacrelstats->num_vtlinks = 0;
952 elog(MESSAGE_LEVEL, "Pages %u: Changed %u, reaped %u, Empty %u, New %u; \
953 Tup %u: Vac %u, Keep/VTL %u/%u, Crash %u, UnUsed %u, MinLen %u, MaxLen %u; \
954 Re-using: Free/Avail. Space %u/%u; EndEmpty/Avail. Pages %u/%u. %s",
955 nblocks, changed_pages, vacuum_pages->num_pages, empty_pages,
956 new_pages, num_tuples, tups_vacuumed,
957 nkeep, vacrelstats->num_vtlinks, ncrash,
958 nunused, min_tlen, max_tlen, free_size, usable_free_size,
959 empty_end_pages, fraged_pages->num_pages,
966 * repair_frag() -- try to repair relation's fragmentation
968 * This routine marks dead tuples as unused and tries re-use dead space
969 * by moving tuples (and inserting indices if needed). It constructs
970 * Nvacpagelist list of free-ed pages (moved tuples) and clean indices
971 * for them after committing (in hack-manner - without losing locks
972 * and freeing memory!) current transaction. It truncates relation
973 * if some end-blocks are gone away.
976 repair_frag(VRelStats *vacrelstats, Relation onerel,
977 VacPageList vacuum_pages, VacPageList fraged_pages,
978 int nindices, Relation *Irel)
988 OffsetNumber offnum = 0,
997 IndexInfo **indexInfo = NULL;
998 Datum idatum[INDEX_MAX_KEYS];
999 char inulls[INDEX_MAX_KEYS];
1000 InsertIndexResult iresult;
1001 VacPageListData Nvacpagelist;
1002 VacPage cur_page = NULL,
1007 int last_move_dest_block = -1,
1022 getrusage(RUSAGE_SELF, &ru0);
1024 myXID = GetCurrentTransactionId();
1025 myCID = GetCurrentCommandId();
1027 tupdesc = RelationGetDescr(onerel);
1029 if (Irel != (Relation *) NULL) /* preparation for index' inserts */
1030 indexInfo = get_index_desc(onerel, nindices, Irel);
1032 Nvacpagelist.num_pages = 0;
1033 num_fraged_pages = fraged_pages->num_pages;
1034 Assert(vacuum_pages->num_pages > vacuum_pages->empty_end_pages);
1035 vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages;
1036 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1037 last_vacuum_block = last_vacuum_page->blkno;
1038 cur_buffer = InvalidBuffer;
1041 vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber));
1042 vacpage->offsets_used = vacpage->offsets_free = 0;
1045 * Scan pages backwards from the last nonempty page, trying to move
1046 * tuples down to lower pages. Quit when we reach a page that we have
1047 * moved any tuples onto. Note that if a page is still in the
1048 * fraged_pages list (list of candidate move-target pages) when we
1049 * reach it, we will remove it from the list. This ensures we never
1050 * move a tuple up to a higher page number.
1052 * NB: this code depends on the vacuum_pages and fraged_pages lists being
1053 * in order, and on fraged_pages being a subset of vacuum_pages.
1055 nblocks = vacrelstats->num_pages;
1056 for (blkno = nblocks - vacuum_pages->empty_end_pages - 1;
1057 blkno > last_move_dest_block;
1060 buf = ReadBuffer(onerel, blkno);
1061 page = BufferGetPage(buf);
1063 vacpage->offsets_free = 0;
1065 isempty = PageIsEmpty(page);
1068 if (blkno == last_vacuum_block) /* it's reaped page */
1070 if (last_vacuum_page->offsets_free > 0) /* there are dead tuples */
1071 { /* on this page - clean */
1073 vacuum_page(page, last_vacuum_page);
1079 if (vacuumed_pages > 0)
1081 /* get prev reaped page from vacuum_pages */
1082 last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1];
1083 last_vacuum_block = last_vacuum_page->blkno;
1087 last_vacuum_page = NULL;
1088 last_vacuum_block = -1;
1090 if (num_fraged_pages > 0 &&
1091 fraged_pages->pagedesc[num_fraged_pages - 1]->blkno ==
1092 (BlockNumber) blkno)
1094 /* page is in fraged_pages too; remove it */
1106 chain_tuple_moved = false; /* no one chain-tuple was moved
1107 * off this page, yet */
1108 vacpage->blkno = blkno;
1109 maxoff = PageGetMaxOffsetNumber(page);
1110 for (offnum = FirstOffsetNumber;
1112 offnum = OffsetNumberNext(offnum))
1114 itemid = PageGetItemId(page, offnum);
1116 if (!ItemIdIsUsed(itemid))
1119 tuple.t_datamcxt = NULL;
1120 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1121 tuple_len = tuple.t_len = ItemIdGetLength(itemid);
1122 ItemPointerSet(&(tuple.t_self), blkno, offnum);
1124 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1126 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1127 elog(ERROR, "Invalid XID in t_cmin");
1128 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1129 elog(ERROR, "HEAP_MOVED_IN was not expected");
1132 * If this (chain) tuple is moved by me already then I
1133 * have to check is it in vacpage or not - i.e. is it moved
1134 * while cleaning this page or some previous one.
1136 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1138 if (keep_tuples == 0)
1140 if (chain_tuple_moved) /* some chains was moved
1142 { /* cleaning this page */
1143 Assert(vacpage->offsets_free > 0);
1144 for (i = 0; i < vacpage->offsets_free; i++)
1146 if (vacpage->offsets[i] == offnum)
1149 if (i >= vacpage->offsets_free) /* not found */
1151 vacpage->offsets[vacpage->offsets_free++] = offnum;
1157 vacpage->offsets[vacpage->offsets_free++] = offnum;
1162 elog(ERROR, "HEAP_MOVED_OFF was expected");
1166 * If this tuple is in the chain of tuples created in updates
1167 * by "recent" transactions then we have to move all chain of
1168 * tuples to another places.
1170 if ((tuple.t_data->t_infomask & HEAP_UPDATED &&
1171 tuple.t_data->t_xmin >= XmaxRecent) ||
1172 (!(tuple.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1173 !(ItemPointerEquals(&(tuple.t_self), &(tuple.t_data->t_ctid)))))
1178 ItemPointerData Ctid;
1179 HeapTupleData tp = tuple;
1180 Size tlen = tuple_len;
1181 VTupleMove vtmove = (VTupleMove)
1182 palloc(100 * sizeof(VTupleMoveData));
1184 int free_vtmove = 100;
1185 VacPage to_vacpage = NULL;
1187 bool freeCbuf = false;
1190 if (vacrelstats->vtlinks == NULL)
1191 elog(ERROR, "No one parent tuple was found");
1192 if (cur_buffer != InvalidBuffer)
1194 WriteBuffer(cur_buffer);
1195 cur_buffer = InvalidBuffer;
1199 * If this tuple is in the begin/middle of the chain then
1200 * we have to move to the end of chain.
1202 while (!(tp.t_data->t_infomask & HEAP_XMAX_INVALID) &&
1203 !(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid))))
1205 Ctid = tp.t_data->t_ctid;
1207 ReleaseBuffer(Cbuf);
1209 Cbuf = ReadBuffer(onerel,
1210 ItemPointerGetBlockNumber(&Ctid));
1211 Cpage = BufferGetPage(Cbuf);
1212 Citemid = PageGetItemId(Cpage,
1213 ItemPointerGetOffsetNumber(&Ctid));
1214 if (!ItemIdIsUsed(Citemid))
1218 * This means that in the middle of chain there
1219 * was tuple updated by older (than XmaxRecent)
1220 * xaction and this tuple is already deleted by
1221 * me. Actually, upper part of chain should be
1222 * removed and seems that this should be handled
1223 * in scan_heap(), but it's not implemented at
1224 * the moment and so we just stop shrinking here.
1226 ReleaseBuffer(Cbuf);
1229 elog(NOTICE, "Child itemid in update-chain marked as unused - can't continue repair_frag");
1232 tp.t_datamcxt = NULL;
1233 tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1235 tlen = tp.t_len = ItemIdGetLength(Citemid);
1239 /* first, can chain be moved ? */
1242 if (to_vacpage == NULL ||
1243 !enough_space(to_vacpage, tlen))
1247 * if to_vacpage no longer has enough free space to be
1248 * useful, remove it from fraged_pages list
1250 if (to_vacpage != NULL &&
1251 !enough_space(to_vacpage, vacrelstats->min_tlen))
1253 Assert(num_fraged_pages > to_item);
1254 memmove(fraged_pages->pagedesc + to_item,
1255 fraged_pages->pagedesc + to_item + 1,
1256 sizeof(VacPage) * (num_fraged_pages - to_item - 1));
1259 for (i = 0; i < num_fraged_pages; i++)
1261 if (enough_space(fraged_pages->pagedesc[i], tlen))
1265 /* can't move item anywhere */
1266 if (i == num_fraged_pages)
1268 for (i = 0; i < num_vtmove; i++)
1270 Assert(vtmove[i].vacpage->offsets_used > 0);
1271 (vtmove[i].vacpage->offsets_used)--;
1277 to_vacpage = fraged_pages->pagedesc[to_item];
1279 to_vacpage->free -= MAXALIGN(tlen);
1280 if (to_vacpage->offsets_used >= to_vacpage->offsets_free)
1281 to_vacpage->free -= MAXALIGN(sizeof(ItemIdData));
1282 (to_vacpage->offsets_used)++;
1283 if (free_vtmove == 0)
1286 vtmove = (VTupleMove) repalloc(vtmove,
1287 (free_vtmove + num_vtmove) *
1288 sizeof(VTupleMoveData));
1290 vtmove[num_vtmove].tid = tp.t_self;
1291 vtmove[num_vtmove].vacpage = to_vacpage;
1292 if (to_vacpage->offsets_used == 1)
1293 vtmove[num_vtmove].cleanVpd = true;
1295 vtmove[num_vtmove].cleanVpd = false;
1300 if (!(tp.t_data->t_infomask & HEAP_UPDATED) ||
1301 tp.t_data->t_xmin < XmaxRecent)
1304 /* Well, try to find tuple with old row version */
1311 VTupleLinkData vtld,
1314 vtld.new_tid = tp.t_self;
1316 vac_find_eq((void *) (vacrelstats->vtlinks),
1317 vacrelstats->num_vtlinks,
1318 sizeof(VTupleLinkData),
1322 elog(ERROR, "Parent tuple was not found");
1323 tp.t_self = vtlp->this_tid;
1324 Pbuf = ReadBuffer(onerel,
1325 ItemPointerGetBlockNumber(&(tp.t_self)));
1326 Ppage = BufferGetPage(Pbuf);
1327 Pitemid = PageGetItemId(Ppage,
1328 ItemPointerGetOffsetNumber(&(tp.t_self)));
1329 if (!ItemIdIsUsed(Pitemid))
1330 elog(ERROR, "Parent itemid marked as unused");
1331 Ptp.t_datamcxt = NULL;
1332 Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid);
1333 Assert(ItemPointerEquals(&(vtld.new_tid),
1334 &(Ptp.t_data->t_ctid)));
1337 * Read above about cases when
1338 * !ItemIdIsUsed(Citemid) (child item is
1339 * removed)... Due to the fact that at the moment
1340 * we don't remove unuseful part of update-chain,
1341 * it's possible to get too old parent row here.
1342 * Like as in the case which caused this problem,
1343 * we stop shrinking here. I could try to find
1344 * real parent row but want not to do it because
1345 * of real solution will be implemented anyway,
1346 * latter, and we are too close to 6.5 release. -
1349 if (Ptp.t_data->t_xmax != tp.t_data->t_xmin)
1352 ReleaseBuffer(Cbuf);
1354 ReleaseBuffer(Pbuf);
1355 for (i = 0; i < num_vtmove; i++)
1357 Assert(vtmove[i].vacpage->offsets_used > 0);
1358 (vtmove[i].vacpage->offsets_used)--;
1361 elog(NOTICE, "Too old parent tuple found - can't continue repair_frag");
1364 #ifdef NOT_USED /* I'm not sure that this will wotk
1368 * If this tuple is updated version of row and it
1369 * was created by the same transaction then no one
1370 * is interested in this tuple - mark it as
1373 if (Ptp.t_data->t_infomask & HEAP_UPDATED &&
1374 Ptp.t_data->t_xmin == Ptp.t_data->t_xmax)
1376 TransactionIdStore(myXID,
1377 (TransactionId *) &(Ptp.t_data->t_cmin));
1378 Ptp.t_data->t_infomask &=
1379 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1380 Ptp.t_data->t_infomask |= HEAP_MOVED_OFF;
1385 tp.t_datamcxt = Ptp.t_datamcxt;
1386 tp.t_data = Ptp.t_data;
1387 tlen = tp.t_len = ItemIdGetLength(Pitemid);
1389 ReleaseBuffer(Cbuf);
1394 if (num_vtmove == 0)
1398 ReleaseBuffer(Cbuf);
1399 if (num_vtmove == 0) /* chain can't be moved */
1404 ItemPointerSetInvalid(&Ctid);
1405 for (ti = 0; ti < num_vtmove; ti++)
1407 VacPage destvacpage = vtmove[ti].vacpage;
1409 /* Get page to move from */
1410 tuple.t_self = vtmove[ti].tid;
1411 Cbuf = ReadBuffer(onerel,
1412 ItemPointerGetBlockNumber(&(tuple.t_self)));
1414 /* Get page to move to */
1415 cur_buffer = ReadBuffer(onerel, destvacpage->blkno);
1417 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1418 if (cur_buffer != Cbuf)
1419 LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE);
1421 ToPage = BufferGetPage(cur_buffer);
1422 Cpage = BufferGetPage(Cbuf);
1424 /* NO ELOG(ERROR) TILL CHANGES ARE LOGGED */
1426 Citemid = PageGetItemId(Cpage,
1427 ItemPointerGetOffsetNumber(&(tuple.t_self)));
1428 tuple.t_datamcxt = NULL;
1429 tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
1430 tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
1433 * make a copy of the source tuple, and then mark the
1434 * source tuple MOVED_OFF.
1436 heap_copytuple_with_tuple(&tuple, &newtup);
1438 RelationInvalidateHeapTuple(onerel, &tuple);
1440 TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
1441 tuple.t_data->t_infomask &=
1442 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1443 tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1446 * If this page was not used before - clean it.
1448 * NOTE: a nasty bug used to lurk here. It is possible
1449 * for the source and destination pages to be the same
1450 * (since this tuple-chain member can be on a page lower
1451 * than the one we're currently processing in the outer
1452 * loop). If that's true, then after vacuum_page() the
1453 * source tuple will have been moved, and tuple.t_data
1454 * will be pointing at garbage. Therefore we must do
1455 * everything that uses tuple.t_data BEFORE this step!!
1457 * This path is different from the other callers of
1458 * vacuum_page, because we have already incremented the
1459 * vacpage's offsets_used field to account for the
1460 * tuple(s) we expect to move onto the page. Therefore
1461 * vacuum_page's check for offsets_used == 0 is
1462 * wrong. But since that's a good debugging check for
1463 * all other callers, we work around it here rather
1466 if (!PageIsEmpty(ToPage) && vtmove[ti].cleanVpd)
1468 int sv_offsets_used = destvacpage->offsets_used;
1470 destvacpage->offsets_used = 0;
1471 vacuum_page(ToPage, destvacpage);
1472 destvacpage->offsets_used = sv_offsets_used;
1476 * Update the state of the copied tuple, and store it
1477 * on the destination page.
1479 TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
1480 newtup.t_data->t_infomask &=
1481 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1482 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1483 newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1484 InvalidOffsetNumber, LP_USED);
1485 if (newoff == InvalidOffsetNumber)
1487 elog(STOP, "moving chain: failed to add item with len = %u to page %u",
1488 tuple_len, destvacpage->blkno);
1490 newitemid = PageGetItemId(ToPage, newoff);
1491 pfree(newtup.t_data);
1492 newtup.t_datamcxt = NULL;
1493 newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1494 ItemPointerSet(&(newtup.t_self), destvacpage->blkno, newoff);
1499 log_heap_move(onerel, tuple.t_self, &newtup);
1501 if (Cbuf != cur_buffer)
1503 PageSetLSN(Cpage, recptr);
1504 PageSetSUI(Cpage, ThisStartUpID);
1506 PageSetLSN(ToPage, recptr);
1507 PageSetSUI(ToPage, ThisStartUpID);
1511 if (((int) destvacpage->blkno) > last_move_dest_block)
1512 last_move_dest_block = destvacpage->blkno;
1515 * Set new tuple's t_ctid pointing to itself for last
1516 * tuple in chain, and to next tuple in chain otherwise.
1518 if (!ItemPointerIsValid(&Ctid))
1519 newtup.t_data->t_ctid = newtup.t_self;
1521 newtup.t_data->t_ctid = Ctid;
1522 Ctid = newtup.t_self;
1527 * Remember that we moved tuple from the current page
1528 * (corresponding index tuple will be cleaned).
1531 vacpage->offsets[vacpage->offsets_free++] =
1532 ItemPointerGetOffsetNumber(&(tuple.t_self));
1536 LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
1537 if (cur_buffer != Cbuf)
1538 LockBuffer(Cbuf, BUFFER_LOCK_UNLOCK);
1540 if (Irel != (Relation *) NULL)
1543 * XXX using CurrentMemoryContext here means
1544 * intra-vacuum memory leak for functional indexes.
1545 * Should fix someday.
1547 * XXX This code fails to handle partial indexes!
1548 * Probably should change it to use ExecOpenIndices.
1550 for (i = 0; i < nindices; i++)
1552 FormIndexDatum(indexInfo[i],
1555 CurrentMemoryContext,
1558 iresult = index_insert(Irel[i],
1567 WriteBuffer(cur_buffer);
1570 cur_buffer = InvalidBuffer;
1572 chain_tuple_moved = true;
1576 /* try to find new page for this tuple */
1577 if (cur_buffer == InvalidBuffer ||
1578 !enough_space(cur_page, tuple_len))
1580 if (cur_buffer != InvalidBuffer)
1582 WriteBuffer(cur_buffer);
1583 cur_buffer = InvalidBuffer;
1586 * If previous target page is now too full to add *any*
1587 * tuple to it, remove it from fraged_pages.
1589 if (!enough_space(cur_page, vacrelstats->min_tlen))
1591 Assert(num_fraged_pages > cur_item);
1592 memmove(fraged_pages->pagedesc + cur_item,
1593 fraged_pages->pagedesc + cur_item + 1,
1594 sizeof(VacPage) * (num_fraged_pages - cur_item - 1));
1598 for (i = 0; i < num_fraged_pages; i++)
1600 if (enough_space(fraged_pages->pagedesc[i], tuple_len))
1603 if (i == num_fraged_pages)
1604 break; /* can't move item anywhere */
1606 cur_page = fraged_pages->pagedesc[cur_item];
1607 cur_buffer = ReadBuffer(onerel, cur_page->blkno);
1608 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1609 ToPage = BufferGetPage(cur_buffer);
1610 /* if this page was not used before - clean it */
1611 if (!PageIsEmpty(ToPage) && cur_page->offsets_used == 0)
1612 vacuum_page(ToPage, cur_page);
1615 LockBuffer(cur_buffer, BUFFER_LOCK_EXCLUSIVE);
1617 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1620 heap_copytuple_with_tuple(&tuple, &newtup);
1622 RelationInvalidateHeapTuple(onerel, &tuple);
1625 * Mark new tuple as moved_in by vacuum and store vacuum XID
1628 TransactionIdStore(myXID, (TransactionId *) &(newtup.t_data->t_cmin));
1629 newtup.t_data->t_infomask &=
1630 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_OFF);
1631 newtup.t_data->t_infomask |= HEAP_MOVED_IN;
1633 /* add tuple to the page */
1634 newoff = PageAddItem(ToPage, (Item) newtup.t_data, tuple_len,
1635 InvalidOffsetNumber, LP_USED);
1636 if (newoff == InvalidOffsetNumber)
1639 failed to add item with len = %u to page %u (free space %u, nusd %u, noff %u)",
1640 tuple_len, cur_page->blkno, cur_page->free,
1641 cur_page->offsets_used, cur_page->offsets_free);
1643 newitemid = PageGetItemId(ToPage, newoff);
1644 pfree(newtup.t_data);
1645 newtup.t_datamcxt = NULL;
1646 newtup.t_data = (HeapTupleHeader) PageGetItem(ToPage, newitemid);
1647 ItemPointerSet(&(newtup.t_data->t_ctid), cur_page->blkno, newoff);
1648 newtup.t_self = newtup.t_data->t_ctid;
1651 * Mark old tuple as moved_off by vacuum and store vacuum XID
1654 TransactionIdStore(myXID, (TransactionId *) &(tuple.t_data->t_cmin));
1655 tuple.t_data->t_infomask &=
1656 ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN);
1657 tuple.t_data->t_infomask |= HEAP_MOVED_OFF;
1662 log_heap_move(onerel, tuple.t_self, &newtup);
1664 PageSetLSN(page, recptr);
1665 PageSetSUI(page, ThisStartUpID);
1666 PageSetLSN(ToPage, recptr);
1667 PageSetSUI(ToPage, ThisStartUpID);
1671 cur_page->offsets_used++;
1673 cur_page->free = ((PageHeader) ToPage)->pd_upper - ((PageHeader) ToPage)->pd_lower;
1674 if (((int) cur_page->blkno) > last_move_dest_block)
1675 last_move_dest_block = cur_page->blkno;
1677 vacpage->offsets[vacpage->offsets_free++] = offnum;
1679 LockBuffer(cur_buffer, BUFFER_LOCK_UNLOCK);
1680 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1682 /* insert index' tuples if needed */
1683 if (Irel != (Relation *) NULL)
1686 * XXX using CurrentMemoryContext here means
1687 * intra-vacuum memory leak for functional indexes.
1688 * Should fix someday.
1690 * XXX This code fails to handle partial indexes!
1691 * Probably should change it to use ExecOpenIndices.
1693 for (i = 0; i < nindices; i++)
1695 FormIndexDatum(indexInfo[i],
1698 CurrentMemoryContext,
1701 iresult = index_insert(Irel[i],
1711 } /* walk along page */
1713 if (offnum < maxoff && keep_tuples > 0)
1717 for (off = OffsetNumberNext(offnum);
1719 off = OffsetNumberNext(off))
1721 itemid = PageGetItemId(page, off);
1722 if (!ItemIdIsUsed(itemid))
1724 tuple.t_datamcxt = NULL;
1725 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1726 if (tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)
1728 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1729 elog(ERROR, "Invalid XID in t_cmin (4)");
1730 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1731 elog(ERROR, "HEAP_MOVED_IN was not expected (2)");
1732 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1734 /* some chains was moved while */
1735 if (chain_tuple_moved)
1736 { /* cleaning this page */
1737 Assert(vacpage->offsets_free > 0);
1738 for (i = 0; i < vacpage->offsets_free; i++)
1740 if (vacpage->offsets[i] == off)
1743 if (i >= vacpage->offsets_free) /* not found */
1745 vacpage->offsets[vacpage->offsets_free++] = off;
1746 Assert(keep_tuples > 0);
1752 vacpage->offsets[vacpage->offsets_free++] = off;
1753 Assert(keep_tuples > 0);
1760 if (vacpage->offsets_free > 0) /* some tuples were moved */
1762 if (chain_tuple_moved) /* else - they are ordered */
1764 qsort((char *) (vacpage->offsets), vacpage->offsets_free,
1765 sizeof(OffsetNumber), vac_cmp_offno);
1767 reap_page(&Nvacpagelist, vacpage);
1775 if (offnum <= maxoff)
1776 break; /* some item(s) left */
1778 } /* walk along relation */
1780 blkno++; /* new number of blocks */
1782 if (cur_buffer != InvalidBuffer)
1784 Assert(num_moved > 0);
1785 WriteBuffer(cur_buffer);
1792 * We have to commit our tuple' movings before we'll truncate
1793 * relation, but we shouldn't lose our locks. And so - quick hack:
1794 * flush buffers and record status of current transaction as
1795 * committed, and continue. - vadim 11/13/96
1798 TransactionIdCommit(myXID);
1803 * Clean uncleaned reaped pages from vacuum_pages list list and set
1804 * xmin committed for inserted tuples
1807 for (i = 0, curpage = vacuum_pages->pagedesc; i < vacuumed_pages; i++, curpage++)
1809 Assert((*curpage)->blkno < (BlockNumber) blkno);
1810 buf = ReadBuffer(onerel, (*curpage)->blkno);
1811 page = BufferGetPage(buf);
1812 if ((*curpage)->offsets_used == 0) /* this page was not used */
1814 if (!PageIsEmpty(page))
1815 vacuum_page(page, *curpage);
1818 /* this page was used */
1821 max_offset = PageGetMaxOffsetNumber(page);
1822 for (newoff = FirstOffsetNumber;
1823 newoff <= max_offset;
1824 newoff = OffsetNumberNext(newoff))
1826 itemid = PageGetItemId(page, newoff);
1827 if (!ItemIdIsUsed(itemid))
1829 tuple.t_datamcxt = NULL;
1830 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1831 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1833 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1834 elog(ERROR, "Invalid XID in t_cmin (2)");
1835 if (tuple.t_data->t_infomask & HEAP_MOVED_IN)
1837 tuple.t_data->t_infomask |= HEAP_XMIN_COMMITTED;
1840 else if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1841 tuple.t_data->t_infomask |= HEAP_XMIN_INVALID;
1843 elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected");
1846 Assert((*curpage)->offsets_used == num_tuples);
1847 checked_moved += num_tuples;
1851 Assert(num_moved == checked_moved);
1853 elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u; Tuple(s) moved: %u. %s",
1854 RelationGetRelationName(onerel),
1855 nblocks, blkno, num_moved,
1858 if (Nvacpagelist.num_pages > 0)
1860 /* vacuum indices again if needed */
1861 if (Irel != (Relation *) NULL)
1867 /* re-sort Nvacpagelist.pagedesc */
1868 for (vpleft = Nvacpagelist.pagedesc,
1869 vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1;
1870 vpleft < vpright; vpleft++, vpright--)
1876 Assert(keep_tuples >= 0);
1877 for (i = 0; i < nindices; i++)
1878 vacuum_index(&Nvacpagelist, Irel[i],
1879 vacrelstats->num_tuples, keep_tuples);
1882 /* clean moved tuples from last page in Nvacpagelist list */
1883 if (vacpage->blkno == (BlockNumber) (blkno - 1) &&
1884 vacpage->offsets_free > 0)
1886 buf = ReadBuffer(onerel, vacpage->blkno);
1887 page = BufferGetPage(buf);
1889 for (offnum = FirstOffsetNumber;
1891 offnum = OffsetNumberNext(offnum))
1893 itemid = PageGetItemId(page, offnum);
1894 if (!ItemIdIsUsed(itemid))
1896 tuple.t_datamcxt = NULL;
1897 tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
1899 if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
1901 if ((TransactionId) tuple.t_data->t_cmin != myXID)
1902 elog(ERROR, "Invalid XID in t_cmin (3)");
1903 if (tuple.t_data->t_infomask & HEAP_MOVED_OFF)
1905 itemid->lp_flags &= ~LP_USED;
1909 elog(ERROR, "HEAP_MOVED_OFF was expected (2)");
1913 Assert(vacpage->offsets_free == num_tuples);
1914 PageRepairFragmentation(page);
1918 /* now - free new list of reaped pages */
1919 curpage = Nvacpagelist.pagedesc;
1920 for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++)
1922 pfree(Nvacpagelist.pagedesc);
1926 * Flush dirty pages out to disk. We do this unconditionally, even if
1927 * we don't need to truncate, because we want to ensure that all tuples
1928 * have correct on-row commit status on disk (see bufmgr.c's comments
1929 * for FlushRelationBuffers()).
1931 i = FlushRelationBuffers(onerel, blkno);
1933 elog(ERROR, "VACUUM (repair_frag): FlushRelationBuffers returned %d",
1936 /* truncate relation, if needed */
1937 if (blkno < nblocks)
1939 blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
1941 vacrelstats->num_pages = blkno; /* set new number of blocks */
1944 if (Irel != (Relation *) NULL) /* pfree index' allocations */
1946 close_indices(nindices, Irel);
1951 if (vacrelstats->vtlinks != NULL)
1952 pfree(vacrelstats->vtlinks);
1956 * vacuum_heap() -- free dead tuples
1958 * This routine marks dead tuples as unused and truncates relation
1959 * if there are "empty" end-blocks.
/*
 * NOTE(review): this is an elided numbered listing -- gaps in the line
 * numbers mean some original statements (braces, declarations, the
 * WriteBuffer/ReleaseBuffer calls, etc.) are not visible here.  Comments
 * below describe only the code that is shown.
 */
1962 vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
/* Only pages before the trailing run of empty pages need per-page work;
 * the empty end-pages are dealt with by the truncation at the bottom. */
1970 nblocks = vacuum_pages->num_pages;
1971 nblocks -= vacuum_pages->empty_end_pages; /* nothing to do with
/* For each reaped page that still records dead offsets, read it in and
 * let vacuum_page() clear its item pointers and defragment it. */
1974 for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++)
1976 if ((*vacpage)->offsets_free > 0)
1978 buf = ReadBuffer(onerel, (*vacpage)->blkno);
1979 page = BufferGetPage(buf);
1980 vacuum_page(page, *vacpage);
1986 * Flush dirty pages out to disk. We do this unconditionally, even if
1987 * we don't need to truncate, because we want to ensure that all tuples
1988 * have correct on-row commit status on disk (see bufmgr.c's comments
1989 * for FlushRelationBuffers()).
1991 Assert(vacrelstats->num_pages >= vacuum_pages->empty_end_pages);
1992 nblocks = vacrelstats->num_pages - vacuum_pages->empty_end_pages;
1994 i = FlushRelationBuffers(onerel, nblocks);
/* NOTE(review): the condition guarding this elog (presumably i < 0) is
 * on an elided line -- confirm against the full source. */
1996 elog(ERROR, "VACUUM (vacuum_heap): FlushRelationBuffers returned %d",
1999 /* truncate relation if there are some empty end-pages */
2000 if (vacuum_pages->empty_end_pages > 0)
2002 elog(MESSAGE_LEVEL, "Rel %s: Pages: %u --> %u.",
2003 RelationGetRelationName(onerel),
2004 vacrelstats->num_pages, nblocks);
2005 nblocks = smgrtruncate(DEFAULT_SMGR, onerel, nblocks);
2006 Assert(nblocks >= 0);
2007 vacrelstats->num_pages = nblocks; /* set new number of blocks */
2012 * vacuum_page() -- free dead tuples on a page
2013 * and repair its fragmentation.
/* NOTE(review): elided numbered listing -- the function's braces and
 * local declarations fall on missing lines; comments cover visible code. */
2016 vacuum_page(Page page, VacPage vacpage)
2021 /* There shouldn't be any tuples moved onto the page yet! */
2022 Assert(vacpage->offsets_used == 0);
/* Clear LP_USED on every line pointer listed in vacpage->offsets[];
 * offsets are 1-based, hence the "- 1" when indexing pd_linp. */
2024 for (i = 0; i < vacpage->offsets_free; i++)
2026 itemid = &(((PageHeader) page)->pd_linp[vacpage->offsets[i] - 1]);
2027 itemid->lp_flags &= ~LP_USED;
/* compact the page now that the dead items are marked unused */
2029 PageRepairFragmentation(page);
2034 * _scan_index() -- scan one index relation to update statistic.
/* NOTE(review): elided numbered listing -- the scan-loop body (which
 * presumably increments nitups) and several declarations are on missing
 * lines; confirm against the full source. */
2038 scan_index(Relation indrel, int num_tuples)
2040 RetrieveIndexResult res;
2041 IndexScanDesc iscan;
/* snapshot resource usage so elapsed CPU time can be reported below */
2046 getrusage(RUSAGE_SELF, &ru0);
2048 /* walk through the entire index */
2049 iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
2052 while ((res = index_getnext(iscan, ForwardScanDirection))
2053 != (RetrieveIndexResult) NULL)
2059 index_endscan(iscan);
2061 /* now update statistics in pg_class */
2062 nipages = RelationGetNumberOfBlocks(indrel);
2063 update_relstats(RelationGetRelid(indrel), nipages, nitups, false, NULL);
2065 elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u. %s",
2066 RelationGetRelationName(indrel), nipages, nitups,
/* warn if the index's tuple count disagrees with the heap's */
2069 if (nitups != num_tuples)
2070 elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
2071 \n\tRecreate the index.",
2072 RelationGetRelationName(indrel), nitups, num_tuples);
2077 * vacuum_index() -- vacuum one index relation.
2079 * Vpl is the VacPageList of the heap we're currently vacuuming.
2080 * It's locked. Indrel is an index relation on the vacuumed heap.
2081 * We don't set locks on the index relation here, since the indexed
2082 * access methods support locking at different granularities.
2083 * We let them handle it.
2085 * Finally, we arrange to update the index relation's statistics in
/* NOTE(review): elided numbered listing -- braces and some statements
 * (including where num_index_tuples is incremented) are on missing lines. */
2089 vacuum_index(VacPageList vacpagelist, Relation indrel, int num_tuples, int keep_tuples)
2091 RetrieveIndexResult res;
2092 IndexScanDesc iscan;
2093 ItemPointer heapptr;
2095 int num_index_tuples;
2100 getrusage(RUSAGE_SELF, &ru0);
2102 /* walk through the entire index */
2103 iscan = index_beginscan(indrel, false, 0, (ScanKey) NULL);
2105 num_index_tuples = 0;
2107 while ((res = index_getnext(iscan, ForwardScanDirection))
2108 != (RetrieveIndexResult) NULL)
2110 heapptr = &res->heap_iptr;
/* If the heap TID this index entry points at was reaped, delete the
 * index entry; otherwise (on elided lines) it survives the scan. */
2112 if ((vp = tid_reaped(heapptr, vacpagelist)) != (VacPage) NULL)
2115 elog(DEBUG, "<%x,%x> -> <%x,%x>",
2116 ItemPointerGetBlockNumber(&(res->index_iptr)),
2117 ItemPointerGetOffsetNumber(&(res->index_iptr)),
2118 ItemPointerGetBlockNumber(&(res->heap_iptr)),
2119 ItemPointerGetOffsetNumber(&(res->heap_iptr)));
/* offsets_free == 0 marks a wholly-empty reaped page; an index entry
 * pointing into one is unexpected, so report it before fixing */
2121 if (vp->offsets_free == 0)
2123 elog(NOTICE, "Index %s: pointer to EmptyPage (blk %u off %u) - fixing",
2124 RelationGetRelationName(indrel),
2125 vp->blkno, ItemPointerGetOffsetNumber(heapptr));
2128 index_delete(indrel, &res->index_iptr);
2136 index_endscan(iscan);
2138 /* now update statistics in pg_class */
2139 num_pages = RelationGetNumberOfBlocks(indrel);
2140 update_relstats(RelationGetRelid(indrel), num_pages, num_index_tuples, false, NULL);
2142 elog(MESSAGE_LEVEL, "Index %s: Pages %u; Tuples %u: Deleted %u. %s",
2143 RelationGetRelationName(indrel), num_pages,
2144 num_index_tuples - keep_tuples, tups_vacuumed,
/* sanity check: surviving index entries should equal heap tuples plus
 * the tuples deliberately kept during this vacuum pass */
2147 if (num_index_tuples != num_tuples + keep_tuples)
2148 elog(NOTICE, "Index %s: NUMBER OF INDEX' TUPLES (%u) IS NOT THE SAME AS HEAP' (%u).\
2149 \n\tRecreate the index.",
2150 RelationGetRelationName(indrel), num_index_tuples, num_tuples);
2155 * tid_reaped() -- is a particular tid reaped?
2157 * vacpagelist->VacPage_array is sorted in right order.
/* NOTE(review): elided numbered listing -- declarations and the final
 * return statement(s) fall on missing lines. */
2160 tid_reaped(ItemPointer itemptr, VacPageList vacpagelist)
2162 OffsetNumber ioffno;
2166 VacPageData vacpage;
/* decompose the TID into block number and offset for the lookups below */
2168 vacpage.blkno = ItemPointerGetBlockNumber(itemptr);
2169 ioffno = ItemPointerGetOffsetNumber(itemptr);
/* binary-search the sorted page-descriptor array for this block */
2172 vpp = (VacPage *) vac_find_eq((void *) (vacpagelist->pagedesc),
2173 vacpagelist->num_pages, sizeof(VacPage), (void *) &vp,
2176 if (vpp == (VacPage *) NULL)
2177 return (VacPage) NULL;
2180 /* ok - we are on true page */
/* a page with no recorded offsets is entirely empty: every tid on it
 * counts as reaped */
2182 if (vp->offsets_free == 0)
2183 { /* this is EmptyPage !!! */
/* otherwise search this page's sorted offset list for the exact offset */
2187 voff = (OffsetNumber *) vac_find_eq((void *) (vp->offsets),
2188 vp->offsets_free, sizeof(OffsetNumber), (void *) &ioffno,
2191 if (voff == (OffsetNumber *) NULL)
2192 return (VacPage) NULL;
2199 * update_relstats() -- update statistics for one relation
2201 * Statistics are stored in several places: the pg_class row for the
2202 * relation has stats about the whole relation, the pg_attribute rows
2203 * for each attribute store "dispersion", and there is a pg_statistic
2204 * row for each (non-system) attribute. (Dispersion probably ought to
2205 * be moved to pg_statistic, but it's not worth doing unless there's
2206 * another reason to have to change pg_attribute.) Dispersion and
2207 * pg_statistic values are only updated by VACUUM ANALYZE, but we
2208 * always update the stats in pg_class.
2210 * This routine works for both index and heap relation entries in
2211 * pg_class. We violate no-overwrite semantics here by storing new
2212 * values for the statistics columns directly into the pg_class
2213 * tuple that's already on the page. The reason for this is that if
2214 * we updated these tuples in the usual way, vacuuming pg_class itself
2215 * wouldn't work very well --- by the time we got done with a vacuum
2216 * cycle, most of the tuples in pg_class would've been obsoleted.
2217 * Updating pg_class's own statistics would be especially tricky.
2218 * Of course, this only works for fixed-size never-null columns, but
/* NOTE(review): elided numbered listing -- local declarations and braces
 * are on missing lines; comments describe visible code only. */
2222 update_relstats(Oid relid, int num_pages, int num_tuples, bool hasindex,
2223 VRelStats *vacrelstats)
2228 Form_pg_class pgcform;
2232 * update number of tuples and number of pages in pg_class
2234 rd = heap_openr(RelationRelationName, RowExclusiveLock);
/* look the row up via syscache; its t_self then locates the on-disk copy */
2236 ctup = SearchSysCacheTupleCopy(RELOID,
2237 ObjectIdGetDatum(relid),
2239 if (!HeapTupleIsValid(ctup))
2240 elog(ERROR, "pg_class entry for relid %u vanished during vacuuming",
2243 /* get the buffer cache tuple */
2244 rtup.t_self = ctup->t_self;
2245 heap_fetch(rd, SnapshotNow, &rtup, &buffer);
2246 heap_freetuple(ctup);
2248 /* overwrite the existing statistics in the tuple */
2249 pgcform = (Form_pg_class) GETSTRUCT(&rtup);
2250 pgcform->reltuples = num_tuples;
2251 pgcform->relpages = num_pages;
2252 pgcform->relhasindex = hasindex;
2254 /* invalidate the tuple in the cache and write the buffer */
2255 RelationInvalidateHeapTuple(rd, &rtup);
2256 WriteBuffer(buffer);
2258 heap_close(rd, RowExclusiveLock);
2262 * reap_page() -- save a page on the array of reaped pages.
2264 * As a side effect of the way that the vacuuming loop for a given
2265 * relation works, higher pages come after lower pages in the array
2266 * (and highest tid on a page is last).
/* NOTE(review): elided numbered listing -- braces/declarations are on
 * missing lines. */
2269 reap_page(VacPageList vacpagelist, VacPage vacpage)
2273 /* allocate a VacPageData entry */
/* sized for the fixed header plus one OffsetNumber per recorded offset */
2274 newvacpage = (VacPage) palloc(sizeof(VacPageData) + vacpage->offsets_free * sizeof(OffsetNumber));
/* copy the offsets array only when it is non-empty */
2277 if (vacpage->offsets_free > 0)
2278 memmove(newvacpage->offsets, vacpage->offsets, vacpage->offsets_free * sizeof(OffsetNumber));
2279 newvacpage->blkno = vacpage->blkno;
2280 newvacpage->free = vacpage->free;
2281 newvacpage->offsets_used = vacpage->offsets_used;
2282 newvacpage->offsets_free = vacpage->offsets_free;
2284 /* insert this page into vacpagelist list */
2285 vpage_insert(vacpagelist, newvacpage);
/* vpage_insert() -- append vpnew to vacpagelist, growing the pagedesc
 * array as needed (elided lines hold braces only, per the numbering). */
2290 vpage_insert(VacPageList vacpagelist, VacPage vpnew)
2292 #define PG_NPAGEDESC 1024
2294 /* allocate a VacPage entry if needed */
/* first insertion: allocate an initial array of PG_NPAGEDESC slots */
2295 if (vacpagelist->num_pages == 0)
2297 vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage));
2298 vacpagelist->num_allocated_pages = PG_NPAGEDESC;
/* array full: double the allocation (amortized O(1) append) */
2300 else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages)
2302 vacpagelist->num_allocated_pages *= 2;
2303 vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage));
2305 vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew;
2306 (vacpagelist->num_pages)++;
/*
 * vac_find_eq() -- binary search for elm in the sorted array
 * bot[0..nelem-1] of elements of the given size, using compar().
 *
 * Returns a pointer to the matching element; presumably NULL when not
 * found -- the not-found returns are on elided lines, confirm against
 * the full source.  first_move/last_move appear to track whether the
 * low/high bounds moved since their last boundary comparison, so those
 * comparisons can be skipped when unchanged.
 * NOTE(review): elided numbered listing -- several branches and the
 * loop construct are on missing lines.
 */
2311 vac_find_eq(void *bot, int nelem, int size, void *elm,
2312 int (*compar) (const void *, const void *))
2315 int last = nelem - 1;
2316 int celm = nelem / 2;
2320 last_move = first_move = true;
2323 if (first_move == true)
2325 res = compar(bot, elm);
2332 if (last_move == true)
2334 res = compar(elm, (void *) ((char *) bot + last * size));
2338 return (void *) ((char *) bot + last * size);
/* probe the middle element */
2341 res = compar(elm, (void *) ((char *) bot + celm * size));
2343 return (void *) ((char *) bot + celm * size);
/* narrow to the upper half: drop celm+1 leading elements */
2357 last = last - celm - 1;
2358 bot = (void *) ((char *) bot + (celm + 1) * size);
2359 celm = (last + 1) / 2;
/* vac_cmp_blk() -- qsort/search comparator ordering VacPage pointers by
 * block number.  NOTE(review): the return statements comparing lblk and
 * rblk are on elided lines. */
2366 vac_cmp_blk(const void *left, const void *right)
2371 lblk = (*((VacPage *) left))->blkno;
2372 rblk = (*((VacPage *) right))->blkno;
/* vac_cmp_offno() -- comparator ordering OffsetNumbers ascending (used
 * by qsort and vac_find_eq).  NOTE(review): the return statements for
 * the '<' and '==' branches are on elided lines. */
2383 vac_cmp_offno(const void *left, const void *right)
2386 if (*(OffsetNumber *) left < *(OffsetNumber *) right)
2388 if (*(OffsetNumber *) left == *(OffsetNumber *) right)
/* vac_cmp_vtlinks() -- comparator ordering VTupleLinks by their new_tid:
 * block-id high word first, then low word, then position within the
 * block.  NOTE(review): the individual return statements are on elided
 * lines. */
2395 vac_cmp_vtlinks(const void *left, const void *right)
2398 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi <
2399 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2401 if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi >
2402 ((VTupleLink) right)->new_tid.ip_blkid.bi_hi)
2404 /* bi_hi-es are equal */
2405 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo <
2406 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2408 if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo >
2409 ((VTupleLink) right)->new_tid.ip_blkid.bi_lo)
2411 /* bi_lo-es are equal */
2412 if (((VTupleLink) left)->new_tid.ip_posid <
2413 ((VTupleLink) right)->new_tid.ip_posid)
2415 if (((VTupleLink) left)->new_tid.ip_posid >
2416 ((VTupleLink) right)->new_tid.ip_posid)
/* get_indices() -- open every index of 'relation', returning the count
 * in *nindices and an array of opened index Relations in *Irel.
 * NOTE(review): elided numbered listing -- braces, the loop counter
 * increment, and early-exit handling are on missing lines. */
2424 get_indices(Relation relation, int *nindices, Relation **Irel)
2430 indexoidlist = RelationGetIndexList(relation);
2432 *nindices = length(indexoidlist);
2435 *Irel = (Relation *) palloc(*nindices * sizeof(Relation));
/* open each index by OID, filling the result array */
2440 foreach(indexoidscan, indexoidlist)
2442 Oid indexoid = lfirsti(indexoidscan);
2444 (*Irel)[i] = index_open(indexoid);
2448 freeList(indexoidlist);
/* close_indices() -- close the indexes opened by get_indices(); no-op
 * for a NULL array.  NOTE(review): the loop construct governing
 * index_close(Irel[nindices]) is on an elided line -- presumably it
 * counts nindices down (e.g. while (nindices--)); confirm against the
 * full source before assuming a bug here. */
2453 close_indices(int nindices, Relation *Irel)
2456 if (Irel == (Relation *) NULL)
2460 index_close(Irel[nindices]);
2467 * Obtain IndexInfo data for each index on the rel
/* NOTE(review): elided numbered listing -- braces, declarations and the
 * final return of the indexInfo array are on missing lines. */
2470 get_index_desc(Relation onerel, int nindices, Relation *Irel)
2472 IndexInfo **indexInfo;
2474 HeapTuple cachetuple;
2476 indexInfo = (IndexInfo **) palloc(nindices * sizeof(IndexInfo *));
/* for each open index, fetch its pg_index tuple from the syscache and
 * build an IndexInfo descriptor; error out if the tuple has vanished */
2478 for (i = 0; i < nindices; i++)
2480 cachetuple = SearchSysCacheTuple(INDEXRELID,
2481 ObjectIdGetDatum(RelationGetRelid(Irel[i])),
2483 if (!HeapTupleIsValid(cachetuple))
2484 elog(ERROR, "get_index_desc: index %u not found",
2485 RelationGetRelid(Irel[i]));
2486 indexInfo[i] = BuildIndexInfo(cachetuple);
/* enough_space() -- can a tuple of size 'len' be stored on this vacuum
 * page?  NOTE(review): elided numbered listing -- some return statements
 * (e.g. the false cases) are on missing lines. */
2494 enough_space(VacPage vacpage, Size len)
/* free-space accounting is done in MAXALIGNed units */
2497 len = MAXALIGN(len);
/* tuple body alone already exceeds remaining free space */
2499 if (len > vacpage->free)
/* a previously-freed item pointer can be reused, so only the tuple body
 * needs room */
2502 if (vacpage->offsets_used < vacpage->offsets_free) /* there are free
2504 return true; /* and len <= free_space */
2506 /* ok. noff_usd >= noff_free and so we'll have to allocate new itemid */
/* must also reserve space for a brand-new ItemIdData */
2507 if (len + MAXALIGN(sizeof(ItemIdData)) <= vacpage->free)
2516 * Compute elapsed time since ru0 usage snapshot, and format into
2517 * a displayable string. Result is in a static string, which is
2518 * tacky, but no one ever claimed that the Postgres backend is
2522 show_rusage(struct rusage * ru0)
2524 static char result[64];
2527 getrusage(RUSAGE_SELF, &ru1);
2529 if (ru1.ru_stime.tv_usec < ru0->ru_stime.tv_usec)
2531 ru1.ru_stime.tv_sec--;
2532 ru1.ru_stime.tv_usec += 1000000;
2534 if (ru1.ru_utime.tv_usec < ru0->ru_utime.tv_usec)
2536 ru1.ru_utime.tv_sec--;
2537 ru1.ru_utime.tv_usec += 1000000;
2540 snprintf(result, sizeof(result),
2541 "CPU %d.%02ds/%d.%02du sec.",
2542 (int) (ru1.ru_stime.tv_sec - ru0->ru_stime.tv_sec),
2543 (int) (ru1.ru_stime.tv_usec - ru0->ru_stime.tv_usec) / 10000,
2544 (int) (ru1.ru_utime.tv_sec - ru0->ru_utime.tv_sec),
2545 (int) (ru1.ru_utime.tv_usec - ru0->ru_utime.tv_usec) / 10000);