/*-------------------------------------------------------------------------
 *
 * freespace.c
 *	  POSTGRES free space map for quickly finding free space in relations
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/storage/freespace/freespace.c,v 1.69 2008/11/27 13:32:26 heikki Exp $
 *
 * NOTES:
 *
 *	Free Space Map keeps track of the amount of free space on pages, and
 *	allows quickly searching for a page with enough free space. The FSM is
 *	stored in a dedicated relation fork of all heap relations, and those
 *	index access methods that need it (see also indexfsm.c). See README for
 *	more information.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/freespace.h"
#include "storage/fsm_internals.h"
#include "storage/lmgr.h"
#include "storage/lwlock.h"
#include "storage/smgr.h"
#include "utils/inval.h"
#include "utils/rel.h"
/*
 * We use just one byte to store the amount of free space on a page, so we
 * divide the amount of free space a page can have into 256 different
 * categories. The highest category, 255, represents a page with at least
 * MaxFSMRequestSize bytes of free space, and the second highest category
 * represents the range from 254 * FSM_CAT_STEP, inclusive, to
 * MaxFSMRequestSize, exclusive.
 *
 * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
 * default 8k BLCKSZ, and that MaxFSMRequestSize is 24 bytes, the categories
 * look like this (each category covers a FSM_CAT_STEP-byte range, with the
 * top category reserved for MaxFSMRequestSize and above).
 *
 * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
 * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
 * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
 * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
 * completely empty page, that would mean that we could never satisfy a
 * request of exactly MaxFSMRequestSize bytes.
 */
#define FSM_CATEGORIES	256
#define FSM_CAT_STEP	(BLCKSZ / FSM_CATEGORIES)
#define MaxFSMRequestSize	MaxHeapTupleSize

/*
 * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
 * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
 * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
 * this means that 4096 bytes is the smallest BLCKSZ that we can get away
 * with a 3-level tree, and 512 is the smallest we support.
 */
#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)

#define FSM_ROOT_LEVEL	(FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0
84 * The internal FSM routines work on a logical addressing scheme. Each
85 * level of the tree can be thought of as a separately addressable file.
89 int level; /* level */
90 int logpageno; /* page number within the level */
93 /* Address of the root page. */
94 static const FSMAddress FSM_ROOT_ADDRESS = { FSM_ROOT_LEVEL, 0 };
96 /* functions to navigate the tree */
97 static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
98 static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
99 static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
100 static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
101 static BlockNumber fsm_logical_to_physical(FSMAddress addr);
103 static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
104 static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
106 /* functions to convert amount of free space to a FSM category */
107 static uint8 fsm_space_avail_to_cat(Size avail);
108 static uint8 fsm_space_needed_to_cat(Size needed);
109 static Size fsm_space_cat_to_avail(uint8 cat);
111 /* workhorse functions for various operations */
112 static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
113 uint8 newValue, uint8 minValue);
114 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
115 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);
118 /******** Public API ********/
121 * GetPageWithFreeSpace - try to find a page in the given relation with
122 * at least the specified amount of free space.
124 * If successful, return the block number; if not, return InvalidBlockNumber.
126 * The caller must be prepared for the possibility that the returned page
127 * will turn out to have too little space available by the time the caller
128 * gets a lock on it. In that case, the caller should report the actual
129 * amount of free space available on that page and then try again (see
130 * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 * extend the relation.
134 GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
136 uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
137 return fsm_search(rel, min_cat);
141 * RecordAndGetPageWithFreeSpace - update info about a page and try again.
143 * We provide this combo form to save some locking overhead, compared to
144 * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
145 * also some effort to return a page close to the old page; if there's a
146 * page with enough free space on the same FSM page where the old one page
147 * is located, it is preferred.
150 RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
151 Size oldSpaceAvail, Size spaceNeeded)
153 int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
154 int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 /* Get the location of the FSM byte representing the heap block */
160 addr = fsm_get_location(oldPage, &slot);
162 search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
165 * If fsm_set_and_search found a suitable new block, return that.
166 * Otherwise, search as usual.
168 if (search_slot != -1)
169 return fsm_get_heap_blk(addr, search_slot);
171 return fsm_search(rel, search_cat);
175 * RecordPageWithFreeSpace - update info about a page.
177 * Note that if the new spaceAvail value is higher than the old value stored
178 * in the FSM, the space might not become visible to searchers until the next
179 * FreeSpaceMapVacuum call, which updates the upper level pages.
182 RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
184 int new_cat = fsm_space_avail_to_cat(spaceAvail);
188 /* Get the location of the FSM byte representing the heap block */
189 addr = fsm_get_location(heapBlk, &slot);
191 fsm_set_and_search(rel, addr, slot, new_cat, 0);
195 * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
199 XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
202 int new_cat = fsm_space_avail_to_cat(spaceAvail);
209 /* Get the location of the FSM byte representing the heap block */
210 addr = fsm_get_location(heapBlk, &slot);
211 blkno = fsm_logical_to_physical(addr);
213 /* If the page doesn't exist already, extend */
214 buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
215 page = BufferGetPage(buf);
217 PageInit(page, BLCKSZ, 0);
219 if (fsm_set_avail(page, slot, new_cat))
220 MarkBufferDirty(buf);
221 UnlockReleaseBuffer(buf);
225 * GetRecordedFreePage - return the amount of free space on a particular page,
226 * according to the FSM.
229 GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
236 /* Get the location of the FSM byte representing the heap block */
237 addr = fsm_get_location(heapBlk, &slot);
239 buf = fsm_readbuf(rel, addr, false);
240 if (!BufferIsValid(buf))
242 cat = fsm_get_avail(BufferGetPage(buf), slot);
245 return fsm_space_cat_to_avail(cat);
249 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
251 * The caller must hold AccessExclusiveLock on the relation, to ensure
252 * that other backends receive the relcache invalidation event that this
253 * function sends, before accessing the FSM again.
255 * nblocks is the new size of the heap.
258 FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
260 BlockNumber new_nfsmblocks;
261 FSMAddress first_removed_address;
262 uint16 first_removed_slot;
265 RelationOpenSmgr(rel);
268 * If no FSM has been created yet for this relation, there's nothing to
271 if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
274 /* Get the location in the FSM of the first removed heap block */
275 first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
278 * Zero out the tail of the last remaining FSM page. If the slot
279 * representing the first removed heap block is at a page boundary, as
280 * the first slot on the FSM page that first_removed_address points to,
281 * we can just truncate that page altogether.
283 if (first_removed_slot > 0)
285 buf = fsm_readbuf(rel, first_removed_address, false);
286 if (!BufferIsValid(buf))
287 return; /* nothing to do; the FSM was already smaller */
288 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
289 fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
290 MarkBufferDirty(buf);
291 UnlockReleaseBuffer(buf);
293 new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
297 new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
298 if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
299 return; /* nothing to do; the FSM was already smaller */
302 /* Truncate the unused FSM pages */
303 smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks, rel->rd_istemp);
306 * Need to invalidate the relcache entry, because rd_fsm_nblocks
307 * seen by other backends is no longer valid.
310 CacheInvalidateRelcache(rel);
312 rel->rd_fsm_nblocks = new_nfsmblocks;
316 * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
319 FreeSpaceMapVacuum(Relation rel)
324 * Traverse the tree in depth-first order. The tree is stored physically
325 * in depth-first order, so this should be pretty I/O efficient.
327 fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
330 /******** Internal routines ********/
333 * Return category corresponding x bytes of free space
336 fsm_space_avail_to_cat(Size avail)
340 Assert(avail < BLCKSZ);
342 if (avail >= MaxFSMRequestSize)
345 cat = avail / FSM_CAT_STEP;
348 * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
358 * Return the lower bound of the range of free space represented by given
362 fsm_space_cat_to_avail(uint8 cat)
364 /* The highest category represents exactly MaxFSMRequestSize bytes. */
366 return MaxFSMRequestSize;
368 return cat * FSM_CAT_STEP;
372 * Which category does a page need to have, to accommodate x bytes of data?
373 * While fsm_size_to_avail_cat() rounds down, this needs to round up.
376 fsm_space_needed_to_cat(Size needed)
380 /* Can't ask for more space than the highest category represents */
381 if (needed > MaxFSMRequestSize)
382 elog(ERROR, "invalid FSM request size %lu",
383 (unsigned long) needed);
388 cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
397 * Returns the physical block number an FSM page
400 fsm_logical_to_physical(FSMAddress addr)
407 * Calculate the logical page number of the first leaf page below the
410 leafno = addr.logpageno;
411 for (l = 0; l < addr.level; l++)
412 leafno *= SlotsPerFSMPage;
414 /* Count upper level nodes required to address the leaf page */
416 for (l = 0; l < FSM_TREE_DEPTH; l++)
419 leafno /= SlotsPerFSMPage;
423 * If the page we were asked for wasn't at the bottom level, subtract
424 * the additional lower level pages we counted above.
428 /* Turn the page count into 0-based block number */
433 * Return the FSM location corresponding to given heap block.
436 fsm_get_location(BlockNumber heapblk, uint16 *slot)
440 addr.level = FSM_BOTTOM_LEVEL;
441 addr.logpageno = heapblk / SlotsPerFSMPage;
442 *slot = heapblk % SlotsPerFSMPage;
448 * Return the heap block number corresponding to given location in the FSM.
451 fsm_get_heap_blk(FSMAddress addr, uint16 slot)
453 Assert(addr.level == FSM_BOTTOM_LEVEL);
454 return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
458 * Given a logical address of a child page, get the logical page number of
459 * the parent, and the slot within the parent corresponding to the child.
462 fsm_get_parent(FSMAddress child, uint16 *slot)
466 Assert(child.level < FSM_ROOT_LEVEL);
468 parent.level = child.level + 1;
469 parent.logpageno = child.logpageno / SlotsPerFSMPage;
470 *slot = child.logpageno % SlotsPerFSMPage;
476 * Given a logical address of a parent page, and a slot number get the
477 * logical address of the corresponding child page.
480 fsm_get_child(FSMAddress parent, uint16 slot)
484 Assert(parent.level > FSM_BOTTOM_LEVEL);
486 child.level = parent.level - 1;
487 child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
495 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
496 * true, the FSM file is extended.
499 fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
501 BlockNumber blkno = fsm_logical_to_physical(addr);
504 RelationOpenSmgr(rel);
506 /* If we haven't cached the size of the FSM yet, check it first */
507 if (rel->rd_fsm_nblocks == InvalidBlockNumber)
509 if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
510 rel->rd_fsm_nblocks = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
512 rel->rd_fsm_nblocks = 0;
515 /* Handle requests beyond EOF */
516 if (blkno >= rel->rd_fsm_nblocks)
519 fsm_extend(rel, blkno + 1);
521 return InvalidBuffer;
525 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
526 * information is not accurate anyway, so it's better to clear corrupt
527 * pages than error out. Since the FSM changes are not WAL-logged, the
528 * so-called torn page problem on crash can lead to pages with corrupt
529 * headers, for example.
531 buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
532 if (PageIsNew(BufferGetPage(buf)))
533 PageInit(BufferGetPage(buf), BLCKSZ, 0);
538 * Ensure that the FSM fork is at least n_fsmblocks long, extending
539 * it if necessary with empty pages. And by empty, I mean pages filled
540 * with zeros, meaning there's no free space.
543 fsm_extend(Relation rel, BlockNumber fsm_nblocks)
545 BlockNumber fsm_nblocks_now;
548 pg = (Page) palloc(BLCKSZ);
549 PageInit(pg, BLCKSZ, 0);
552 * We use the relation extension lock to lock out other backends
553 * trying to extend the FSM at the same time. It also locks out
554 * extension of the main fork, unnecessarily, but extending the
555 * FSM happens seldom enough that it doesn't seem worthwhile to
556 * have a separate lock tag type for it.
558 * Note that another backend might have extended the relation
559 * before we get the lock.
561 LockRelationForExtension(rel, ExclusiveLock);
563 /* Create the FSM file first if it doesn't exist */
564 if ((rel->rd_fsm_nblocks == 0 || rel->rd_fsm_nblocks == InvalidBlockNumber)
565 && !smgrexists(rel->rd_smgr, FSM_FORKNUM))
567 smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
571 fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
573 while (fsm_nblocks_now < fsm_nblocks)
575 smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
576 (char *) pg, rel->rd_istemp);
580 UnlockRelationForExtension(rel, ExclusiveLock);
584 /* Update the relcache with the up-to-date size */
586 CacheInvalidateRelcache(rel);
587 rel->rd_fsm_nblocks = fsm_nblocks_now;
591 * Set value in given FSM page and slot.
593 * If minValue > 0, the updated page is also searched for a page with at
594 * least minValue of free space. If one is found, its slot number is
595 * returned, -1 otherwise.
598 fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
599 uint8 newValue, uint8 minValue)
605 buf = fsm_readbuf(rel, addr, true);
606 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
608 page = BufferGetPage(buf);
610 if (fsm_set_avail(page, slot, newValue))
611 MarkBufferDirty(buf);
615 /* Search while we still hold the lock */
616 newslot = fsm_search_avail(buf, minValue,
617 addr.level == FSM_BOTTOM_LEVEL,
621 UnlockReleaseBuffer(buf);
627 * Search the tree for a heap page with at least min_cat of free space
630 fsm_search(Relation rel, uint8 min_cat)
633 FSMAddress addr = FSM_ROOT_ADDRESS;
641 /* Read the FSM page. */
642 buf = fsm_readbuf(rel, addr, false);
644 /* Search within the page */
645 if (BufferIsValid(buf))
647 LockBuffer(buf, BUFFER_LOCK_SHARE);
648 slot = fsm_search_avail(buf, min_cat,
649 (addr.level == FSM_BOTTOM_LEVEL),
652 max_avail = fsm_get_max_avail(BufferGetPage(buf));
653 UnlockReleaseBuffer(buf);
661 * Descend the tree, or return the found block if we're at the
664 if (addr.level == FSM_BOTTOM_LEVEL)
665 return fsm_get_heap_blk(addr, slot);
667 addr = fsm_get_child(addr, slot);
669 else if (addr.level == FSM_ROOT_LEVEL)
672 * At the root, failure means there's no page with enough free
673 * space in the FSM. Give up.
675 return InvalidBlockNumber;
683 * At lower level, failure can happen if the value in the upper-
684 * level node didn't reflect the value on the lower page. Update
685 * the upper node, to avoid falling into the same trap again, and
688 * There's a race condition here, if another backend updates this
689 * page right after we release it, and gets the lock on the parent
690 * page before us. We'll then update the parent page with the now
691 * stale information we had. It's OK, because it should happen
692 * rarely, and will be fixed by the next vacuum.
694 parent = fsm_get_parent(addr, &parentslot);
695 fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
698 * If the upper pages are badly out of date, we might need to
699 * loop quite a few times, updating them as we go. Any
700 * inconsistencies should eventually be corrected and the loop
701 * should end. Looping indefinitely is nevertheless scary, so
702 * provide an emergency valve.
704 if (restarts++ > 10000)
705 return InvalidBlockNumber;
707 /* Start search all over from the root */
708 addr = FSM_ROOT_ADDRESS;
715 * Recursive guts of FreeSpaceMapVacuum
718 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
724 /* Read the page if it exists, or return EOF */
725 buf = fsm_readbuf(rel, addr, false);
726 if (!BufferIsValid(buf))
734 page = BufferGetPage(buf);
737 * Recurse into children, and fix the information stored about them
740 if (addr.level > FSM_BOTTOM_LEVEL)
745 for (slot = 0; slot < SlotsPerFSMPage; slot++)
749 /* After we hit end-of-file, just clear the rest of the slots */
751 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
755 /* Update information about the child */
756 if (fsm_get_avail(page, slot) != child_avail)
758 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
759 fsm_set_avail(BufferGetPage(buf), slot, child_avail);
760 MarkBufferDirty(buf);
761 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
766 max_avail = fsm_get_max_avail(BufferGetPage(buf));
769 * Reset the next slot pointer. This encourages the use of low-numbered
770 * pages, increasing the chances that a later vacuum can truncate the
773 ((FSMPage) PageGetContents(page))->fp_next_slot = 0;