CREATE EXTENSION pageinspect;
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
-VACUUM test1; -- set up FSM
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT i from generate_series(1,1000) i;
+-- set up FSM and VM
+VACUUM test_rel_forks;
-- The page contents can vary, so just test that it can be read
-- successfully, but don't keep the output.
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
main_0
--------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
-ERROR: block number 1 is out of range for relation "test1"
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
+ERROR: block number 100 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
fsm_0
-------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
- fsm_1
--------
- 8192
-(1 row)
-
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
+ERROR: block number 10 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
vm_0
------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
-ERROR: block number 1 is out of range for relation "test1"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
+ERROR: block number 1 is out of range for relation "test_rel_forks"
SELECT octet_length(get_raw_page('xxx', 'main', 0));
ERROR: relation "xxx" does not exist
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
ERROR: invalid fork name
HINT: Valid fork names are "main", "fsm", "vm", and "init".
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+ fsm_page_contents
+-------------------
+ 0: 147 +
+ 1: 147 +
+ 3: 147 +
+ 7: 147 +
+ 15: 147 +
+ 31: 147 +
+ 63: 147 +
+ 127: 147 +
+ 255: 147 +
+ 511: 147 +
+ 1023: 147 +
+ 2047: 147 +
+ 4095: 147 +
+ fp_next_slot: 0 +
+
+(1 row)
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
?column?
----------
t
(1 row)
+DROP TABLE test_rel_forks;
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
pagesize | version
----------+---------
{"\\x01000001","\\x00020200"}
(1 row)
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
- fsm_page_contents
--------------------
- 0: 254 +
- 1: 254 +
- 3: 254 +
- 7: 254 +
- 15: 254 +
- 31: 254 +
- 63: 254 +
- 127: 254 +
- 255: 254 +
- 511: 254 +
- 1023: 254 +
- 2047: 254 +
- 4095: 254 +
- fp_next_slot: 0 +
-
-(1 row)
-
DROP TABLE test1;
-- check that using any of these functions with a partitioned table or index
-- would fail
CREATE EXTENSION pageinspect;
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT i from generate_series(1,1000) i;
-VACUUM test1; -- set up FSM
+-- set up FSM and VM
+VACUUM test_rel_forks;
-- The page contents can vary, so just test that it can be read
-- successfully, but don't keep the output.
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
SELECT octet_length(get_raw_page('xxx', 'main', 0));
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
+
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+DROP TABLE test_rel_forks;
+
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
FROM heap_page_items(get_raw_page('test1', 0));
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
-
DROP TABLE test1;
-- check that using any of these functions with a partitioned table or index
<indexterm><primary>FSM</primary><see>Free Space Map</see></indexterm>
<para>
-Each heap and index relation, except for hash indexes, has a Free Space Map
-(FSM) to keep track of available space in the relation. It's stored
-alongside the main relation data in a separate relation fork, named after the
-filenode number of the relation, plus a <literal>_fsm</literal> suffix. For example,
-if the filenode of a relation is 12345, the FSM is stored in a file called
-<filename>12345_fsm</filename>, in the same directory as the main relation file.
+Each heap relation, unless it is very small, and each index relation, except
+for hash indexes, has a Free Space Map (FSM) to keep track of available
+space in the relation. It's stored alongside the main relation data in a
+separate relation fork, named after the filenode number of the relation, plus
+a <literal>_fsm</literal> suffix. For example, if the filenode of a relation
+is 12345, the FSM is stored in a file called <filename>12345_fsm</filename>,
+in the same directory as the main relation file.
</para>
<para>
freespace = PageGetFreeSpace(page);
blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
ReleaseBuffer(state->bs_currentInsertBuf);
- RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+ RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
}
if (extended)
{
- RecordPageWithFreeSpace(idxrel, newblk, freespace);
+ RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
}
if (extended)
{
- RecordPageWithFreeSpace(idxrel, blk, freespace);
+ RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
}
/* Measure free space and record it */
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
- br_page_get_freespace(page));
+ br_page_get_freespace(page), InvalidBlockNumber);
}
/*
/* Choose initial target page, re-using existing target if known */
newblk = RelationGetTargetBlock(irel);
if (newblk == InvalidBlockNumber)
- newblk = GetPageWithFreeSpace(irel, itemsz);
+ newblk = GetPageWithFreeSpace(irel, itemsz, true);
/*
* Loop until we find a page with sufficient free space. By the time we
* pages whose FSM records were forgotten in a crash.
*/
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
- br_page_get_freespace(page));
+ br_page_get_freespace(page), InvalidBlockNumber);
}
* Immediately update the bottom level of the FSM. This has a good
* chance of making this page visible to other concurrently inserting
* backends, and we want that to happen without delay.
+ *
+ * Since we know the table will end up with extraBlocks additional
+ * pages, we pass the final number to avoid possible unnecessary
+ * system calls and to make sure the FSM is created when we add the
+ * first new page.
*/
- RecordPageWithFreeSpace(relation, blockNum, freespace);
+ RecordPageWithFreeSpace(relation, blockNum, freespace,
+ firstBlock + extraBlocks);
}
while (--extraBlocks > 0);
* We have no cached target page, so ask the FSM for an initial
* target.
*/
- targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
- /*
- * If the FSM knows nothing of the rel, try the last page before we
- * give up and extend. This avoids one-tuple-per-page syndrome during
- * bootstrapping or in a recently-started system.
- */
- if (targetBlock == InvalidBlockNumber)
- {
- BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
-
- if (nblocks > 0)
- targetBlock = nblocks - 1;
- }
+ targetBlock = GetPageWithFreeSpace(relation,
+ len + saveFreeSpace,
+ false);
}
loop:
{
/* use this page as future insert target, too */
RelationSetTargetBlock(relation, targetBlock);
+
+ /*
+ * In case we used an in-memory map of available blocks, reset it
+ * for next use.
+ */
+ if (targetBlock < HEAP_FSM_CREATION_THRESHOLD)
+ FSMClearLocalMap();
+
return buffer;
}
/*
* Check if some other backend has extended a block for us while
- * we were waiting on the lock.
+ * we were waiting on the lock. We only check the FSM -- if there
+ * isn't one we don't recheck the number of blocks.
*/
- targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+ targetBlock = GetPageWithFreeSpace(relation,
+ len + saveFreeSpace,
+ true);
/*
* If some other waiter has already extended the relation, we
*/
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
+ /*
+ * In case we used an in-memory map of available blocks, reset it for next
+ * use. We do this unconditionally since after relation extension we
+ * can't skip this based on the targetBlock.
+ */
+ FSMClearLocalMap();
+
return buffer;
}
static void lazy_scan_heap(Relation onerel, int options,
LVRelStats *vacrelstats, Relation *Irel, int nindexes,
bool aggressive);
-static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
+static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
static void lazy_vacuum_index(Relation indrel,
IndexBulkDeleteResult **stats,
pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
/* Remove tuples from heap */
- lazy_vacuum_heap(onerel, vacrelstats);
+ lazy_vacuum_heap(onerel, vacrelstats, nblocks);
/*
* Forget the now-vacuumed tuples, and press on, but be careful
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
continue;
}
}
UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
continue;
}
* taken if there are no indexes.)
*/
if (vacrelstats->num_dead_tuples == prev_dead_count)
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
}
/* report that everything is scanned and vacuumed */
/* Remove tuples from heap */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
- lazy_vacuum_heap(onerel, vacrelstats);
+ lazy_vacuum_heap(onerel, vacrelstats, nblocks);
vacrelstats->num_index_scans++;
}
* Note: the reason for doing this as a second pass is we cannot remove
* the tuples until we've removed their index entries, and we want to
* process index entry removal in batches as large as possible.
+ * Note: nblocks is passed as an optimization for RecordPageWithFreeSpace().
*/
static void
-lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
+lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks)
{
int tupindex;
int npages;
freespace = PageGetHeapFreeSpace(page);
UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, tblk, freespace);
+ RecordPageWithFreeSpace(onerel, tblk, freespace, nblocks);
npages++;
}
#include "replication/walsender.h"
#include "storage/condition_variable.h"
#include "storage/fd.h"
+#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
pgstat_report_wait_end();
pgstat_progress_end_command();
+ /*
+ * In case we aborted during RelationGetBufferForTuple(), clear the local
+ * map of heap pages.
+ */
+ FSMClearLocalMap();
+
/* Clean up buffer I/O and buffer context locks, too */
AbortBufferIO();
UnlockBuffers();
pgstat_report_wait_end();
pgstat_progress_end_command();
+
+ /*
+ * In case we aborted during RelationGetBufferForTuple(), clear the local
+ * map of heap pages.
+ */
+ FSMClearLocalMap();
+
AbortBufferIO();
UnlockBuffers();
exists and the relation must be extended by one page. As of PostgreSQL 8.4
each relation has its own, extensible free space map stored in a separate
"fork" of its relation. This eliminates the disadvantages of the former
-fixed-size FSM.
+fixed-size FSM. There are two exceptions:
+
+1. Hash indexes never have a FSM.
+2. For very small tables, a 3-page relation fork would be relatively large
+and wasteful, so to save space we refrain from creating the FSM if the
+heap has HEAP_FSM_CREATION_THRESHOLD pages or fewer.
+
+To locate free space in the latter case, we simply try pages directly without
+knowing ahead of time how much free space they have. To maintain good
+performance, we create a local in-memory map of pages to try, and only mark
+every other page as available. For example, in a 3-page heap, the local map
+would look like:
+
+ANAN
+0123
+
+Pages 0 and 2 are marked "available", and page 1 as "not available".
+Page 3 is beyond the end of the relation, so is likewise marked "not
+available". First we try page 2, and if that doesn't have sufficient free
+space we try page 0 before giving up and extending the relation. There may
+be some wasted free space on block 1, but if the relation extends to 4 pages:
+
+NANA
+0123
+
+We not only have the new page 3 at our disposal, we can now check page 1
+for free space as well.
+
+Once the FSM is created for a heap we don't remove it even if somebody deletes
+all the rows from the corresponding relation. We don't think removing it
+would be a useful optimization, as it is quite likely that the relation will
+grow to the same size again.
+
+FSM data structure
+------------------
It is important to keep the map small so that it can be searched rapidly.
Therefore, we don't attempt to record the exact free space on a page.
----
- fastroot to avoid traversing upper nodes with just 1 child
-- use a different system for tables that fit into one FSM page, with a
- mechanism to switch to the real thing as it grows.
#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0
+/* Status codes for the local map. */
+
+/* Either already tried, or beyond the end of the relation */
+#define FSM_LOCAL_NOT_AVAIL 0x00
+
+/* Available to try */
+#define FSM_LOCAL_AVAIL 0x01
+
/*
* The internal FSM routines work on a logical addressing scheme. Each
* level of the tree can be thought of as a separately addressable file.
/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
+/*
+ * Local map of block numbers for small heaps with no FSM.
+ *
+ * map[] is indexed by heap block number and holds FSM_LOCAL_AVAIL or
+ * FSM_LOCAL_NOT_AVAIL for each block. nblocks is the relation size the
+ * map was built for; zero means no map is currently set, which is what
+ * FSM_LOCAL_MAP_EXISTS tests and what FSMClearLocalMap() restores.
+ */
+typedef struct
+{
+ BlockNumber nblocks; /* heap blocks covered by the map; 0 = no map */
+ uint8 map[HEAP_FSM_CREATION_THRESHOLD]; /* per-block status code */
+} FSMLocalMap;
+
+static FSMLocalMap fsm_local_map = {0, {FSM_LOCAL_NOT_AVAIL}};
+
+#define FSM_LOCAL_MAP_EXISTS (fsm_local_map.nblocks > 0)
+
/* functions to navigate the tree */
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
/* workhorse functions for various operations */
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
uint8 newValue, uint8 minValue);
+static void fsm_local_set(Relation rel, BlockNumber cur_nblocks);
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static BlockNumber fsm_local_search(void);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof);
+static bool fsm_allow_writes(Relation rel, BlockNumber heapblk,
+ BlockNumber nblocks, BlockNumber *get_nblocks);
/******** Public API ********/
* amount of free space available on that page and then try again (see
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
* extend the relation.
+ *
+ * For very small heap relations that don't have a FSM, we try every other
+ * page before extending the relation. To keep track of which pages have
+ * been tried, initialize a local in-memory map of pages.
*/
BlockNumber
-GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded, bool check_fsm_only)
{
uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
+ BlockNumber target_block,
+ nblocks;
+
+ /* First try the FSM, if it exists. */
+ target_block = fsm_search(rel, min_cat);
+
+ if (target_block == InvalidBlockNumber &&
+ (rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE) &&
+ !check_fsm_only)
+ {
+ nblocks = RelationGetNumberOfBlocks(rel);
+
+ if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ {
+ /*
+ * If the FSM knows nothing of the rel, try the last page before
+ * we give up and extend. This avoids one-tuple-per-page syndrome
+ * during bootstrapping or in a recently-started system.
+ */
+ target_block = nblocks - 1;
+ }
+ else if (nblocks > 0)
+ {
+ /* Create or update local map and get first candidate block. */
+ fsm_local_set(rel, nblocks);
+ target_block = fsm_local_search();
+ }
+ }
- return fsm_search(rel, min_cat);
+ return target_block;
}
/*
* also some effort to return a page close to the old page; if there's a
* page with enough free space on the same FSM page where the old one page
* is located, it is preferred.
+ *
+ * For very small heap relations that don't have a FSM, we update the local
+ * map to indicate we have tried a page, and return the next page to try.
*/
BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
Size oldSpaceAvail, Size spaceNeeded)
{
- int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
- int search_cat = fsm_space_needed_to_cat(spaceNeeded);
+ int old_cat;
+ int search_cat;
FSMAddress addr;
uint16 slot;
int search_slot;
+ BlockNumber nblocks = InvalidBlockNumber;
+
+ /* First try the local map, if it exists. */
+ if (FSM_LOCAL_MAP_EXISTS)
+ {
+ Assert((rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE) &&
+ fsm_local_map.map[oldPage] == FSM_LOCAL_AVAIL);
+
+ fsm_local_map.map[oldPage] = FSM_LOCAL_NOT_AVAIL;
+ return fsm_local_search();
+ }
+
+ if (!fsm_allow_writes(rel, oldPage, InvalidBlockNumber, &nblocks))
+ {
+ /*
+ * If we have neither a local map nor a FSM, we probably just
+ * tried the target block in the smgr relation entry and failed,
+ * so we'll need to create the local map.
+ */
+ fsm_local_set(rel, nblocks);
+ return fsm_local_search();
+ }
+
+ /* Normal FSM logic follows */
+
+ old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+ search_cat = fsm_space_needed_to_cat(spaceNeeded);
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(oldPage, &slot);
* Note that if the new spaceAvail value is higher than the old value stored
* in the FSM, the space might not become visible to searchers until the next
* FreeSpaceMapVacuum call, which updates the upper level pages.
+ *
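+ * This function never consults or builds the local map: if the FSM may not
+ * be written for this block, it simply returns without recording anything.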
*/
void
-RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+ Size spaceAvail, BlockNumber nblocks)
{
- int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ int new_cat;
FSMAddress addr;
uint16 slot;
+ BlockNumber dummy;
+
+ if (!fsm_allow_writes(rel, heapBlk, nblocks, &dummy))
+ /* No FSM to update and no local map either */
+ return;
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(heapBlk, &slot);
+ new_cat = fsm_space_avail_to_cat(spaceAvail);
fsm_set_and_search(rel, addr, slot, new_cat, 0);
}
+/*
+ * Clear the local map. We must call this when we have found a block with
+ * enough free space, when we extend the relation, or on transaction abort.
+ *
+ * Resets nblocks to zero, so FSM_LOCAL_MAP_EXISTS becomes false, and marks
+ * every map entry FSM_LOCAL_NOT_AVAIL. The reset is unconditional, so
+ * calling this when no map is set is harmless.
+ */
+void
+FSMClearLocalMap(void)
+{
+ fsm_local_map.nblocks = 0;
+ memset(&fsm_local_map.map, FSM_LOCAL_NOT_AVAIL,
+ sizeof(fsm_local_map.map));
+}
+
/*
* XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
* WAL replay
BlockNumber blkno;
Buffer buf;
Page page;
+ bool write_to_fsm;
+
+ /* This is meant to mirror the logic in fsm_allow_writes() */
+ if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD)
+ write_to_fsm = true;
+ else
+ {
+ /* Open the relation at smgr level */
+ SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+
+ if (smgrexists(smgr, FSM_FORKNUM))
+ write_to_fsm = true;
+ else
+ {
+ BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM);
+ if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ write_to_fsm = true;
+ else
+ write_to_fsm = false;
+ }
+ }
+
+ if (!write_to_fsm)
+ return;
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(heapBlk, &slot);
return max_avail;
}
+
+/*
+ * For heaps, we prevent creation of the FSM unless the number of pages
+ * exceeds HEAP_FSM_CREATION_THRESHOLD. For tables that don't already have
+ * a FSM, this will save an inode and a few kB of space.
+ *
+ * XXX The API is a little awkward -- if the caller passes a valid nblocks
+ * value, it can avoid invoking a system call. If the caller passes
+ * InvalidBlockNumber and receives a false return value, it can get an
+ * up-to-date relation size from get_nblocks. This saves a few cycles in
+ * the caller, which would otherwise need to get the relation size by itself.
+ *
+ * Returns true if the FSM may be written for this block: heapblk is at or
+ * past the threshold, rel is neither a heap nor a TOAST relation, the known
+ * size (nblocks if valid, else relpages) exceeds the threshold, the FSM
+ * fork already exists, or a fresh block count exceeds the threshold.
+ * Otherwise returns false.
+ *
+ * *get_nblocks is written only on the last-resort path, i.e. when nblocks
+ * is InvalidBlockNumber and none of the cheaper checks returned true; on
+ * every other path it is left untouched.
+ */
+static bool
+fsm_allow_writes(Relation rel, BlockNumber heapblk,
+ BlockNumber nblocks, BlockNumber *get_nblocks)
+{
+ bool skip_get_nblocks;
+
+ if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+
+ /* Non-heap rels can always create a FSM. */
+ if (rel->rd_rel->relkind != RELKIND_RELATION &&
+ rel->rd_rel->relkind != RELKIND_TOASTVALUE)
+ return true;
+
+ /*
+ * If the caller knows nblocks, we can avoid a system call later.
+ * If it doesn't, maybe we have relpages from a previous VACUUM.
+ * Since the table may have extended since then, we still have to
+ * count the pages later if we can't return now.
+ */
+ if (nblocks != InvalidBlockNumber)
+ {
+ if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ skip_get_nblocks = true;
+ }
+ else
+ {
+ if (rel->rd_rel->relpages != InvalidBlockNumber &&
+ rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ skip_get_nblocks = false;
+ }
+
+ RelationOpenSmgr(rel);
+ if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
+ return true;
+
+ if (skip_get_nblocks)
+ return false;
+
+ /* last resort */
+ *get_nblocks = RelationGetNumberOfBlocks(rel);
+ if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Initialize or update the local map of blocks to try, for when there is
+ * no FSM.
+ *
+ * When we initialize the map, the whole heap is potentially available to
+ * try. Testing revealed that trying every block can cause a small
+ * performance dip compared to when we use a FSM, so we try every other
+ * block instead.
+ *
+ * cur_nblocks is the current number of heap blocks. It is used to index
+ * map[] (as cur_nblocks - 1) without a range check, so it must be between 1
+ * and HEAP_FSM_CREATION_THRESHOLD inclusive; callers are responsible for
+ * that.
+ *
+ * After the alternating blocks are marked, the relation's cached target
+ * block, if any, is marked not available (presumably because the caller has
+ * already tried it -- confirm against the callers).
+ */
+static void
+fsm_local_set(Relation rel, BlockNumber cur_nblocks)
+{
+ BlockNumber blkno,
+ cached_target_block;
+
+ /* The local map must not be set already. */
+ Assert(!FSM_LOCAL_MAP_EXISTS);
+
+ /*
+ * Starting at the current last block in the relation and working
+ * backwards, mark alternating blocks as available.
+ */
+ blkno = cur_nblocks - 1;
+ while (true)
+ {
+ fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL;
+ if (blkno >= 2)
+ blkno -= 2;
+ else
+ break;
+ }
+
+ /* Cache the number of blocks. */
+ fsm_local_map.nblocks = cur_nblocks;
+
+ /* Set the status of the cached target block to 'unavailable'. */
+ cached_target_block = RelationGetTargetBlock(rel);
+ if (cached_target_block != InvalidBlockNumber &&
+ cached_target_block < cur_nblocks)
+ fsm_local_map.map[cached_target_block] = FSM_LOCAL_NOT_AVAIL;
+}
+
+/*
+ * Search the local map for an available block to try, in descending order
+ * of block number.
+ *
+ * Nothing is known about how much free space each block has, so no
+ * heuristic can pick the best order to try them in. We start from the last
+ * block in the map because, being the most recent block added to the heap,
+ * it is the most likely to have space.
+ *
+ * Returns the highest-numbered block still marked FSM_LOCAL_AVAIL, or
+ * InvalidBlockNumber if none is. The map itself is not modified here.
+ *
+ * This function is used when there is no FSM.
+ */
+static BlockNumber
+fsm_local_search(void)
+{
+ BlockNumber target_block;
+
+ /* Local map must be set by now. */
+ Assert(FSM_LOCAL_MAP_EXISTS);
+
+ target_block = fsm_local_map.nblocks;
+ do
+ {
+ target_block--;
+ if (fsm_local_map.map[target_block] == FSM_LOCAL_AVAIL)
+ return target_block;
+ } while (target_block > 0);
+
+ return InvalidBlockNumber;
+}
BlockNumber
GetFreeIndexPage(Relation rel)
{
- BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2);
+ BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2, true);
if (blkno != InvalidBlockNumber)
RecordUsedIndexPage(rel, blkno);
void
RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
{
- RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+ RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1, InvalidBlockNumber);
}
void
RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
{
- RecordPageWithFreeSpace(rel, usedBlock, 0);
+ RecordPageWithFreeSpace(rel, usedBlock, 0, InvalidBlockNumber);
}
/*
#include "storage/relfilenode.h"
#include "utils/relcache.h"
+/* Only create the FSM if the heap has more than this many blocks */
+#define HEAP_FSM_CREATION_THRESHOLD 4
+
/* prototypes for public functions in freespace.c */
extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
-extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded,
+ bool check_fsm_only);
extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
BlockNumber oldPage,
Size oldSpaceAvail,
Size spaceNeeded);
extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
- Size spaceAvail);
+ Size spaceAvail, BlockNumber nblocks);
+extern void FSMClearLocalMap(void);
extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
Size spaceAvail);
--- /dev/null
+--
+-- Free Space Map test
+--
+CREATE TABLE fsm_check_size (num int, str text);
+-- Fill 3 blocks with as many large records as will fit
+-- No FSM
+INSERT INTO fsm_check_size SELECT i, rpad('', 1024, 'a')
+FROM generate_series(1,7*3) i;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ heap_size | fsm_size
+-----------+----------
+ 24576 | 0
+(1 row)
+
+-- Clear some space on block 0
+DELETE FROM fsm_check_size WHERE num <= 5;
+VACUUM fsm_check_size;
+-- Insert small record in block 2 to set the cached smgr targetBlock
+INSERT INTO fsm_check_size VALUES(99, 'b');
+-- Insert large record and make sure it goes in block 0 rather than
+-- causing the relation to extend
+INSERT INTO fsm_check_size VALUES (101, rpad('', 1024, 'a'));
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ heap_size | fsm_size
+-----------+----------
+ 24576 | 0
+(1 row)
+
+-- Extend table with enough blocks to exceed the FSM threshold
+-- FSM is created and extended to 3 blocks
+INSERT INTO fsm_check_size SELECT i, 'c' FROM generate_series(200,1200) i;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size
+----------
+ 24576
+(1 row)
+
+-- Truncate heap to 1 block
+-- No change in FSM
+DELETE FROM fsm_check_size WHERE num > 7;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size
+----------
+ 24576
+(1 row)
+
+-- Truncate heap to 0 blocks
+-- FSM now truncated to 2 blocks
+DELETE FROM fsm_check_size;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size
+----------
+ 16384
+(1 row)
+
+-- Add long random string to extend TOAST table to 1 block
+INSERT INTO fsm_check_size
+VALUES(0, (SELECT string_agg(md5(chr(i)), '')
+ FROM generate_series(1,100) i));
+VACUUM fsm_check_size;
+SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size,
+pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size
+FROM pg_class WHERE relname = 'fsm_check_size';
+ toast_size | toast_fsm_size
+------------+----------------
+ 8192 | 0
+(1 row)
+
+DROP TABLE fsm_check_size;
# ----------
test: sanity_check
+# ----------
+# fsm does a delete followed by vacuum, and running it in parallel with
+# other tests can prevent the vacuum from removing the deleted rows.
+# ----------
+test: fsm
+
# ----------
# Believe it or not, select creates a table, subsequent
# tests need.
test: create_am
test: hash_func
test: sanity_check
+test: fsm
test: errors
test: select
test: select_into
--- /dev/null
+--
+-- Free Space Map test
+--
+
+CREATE TABLE fsm_check_size (num int, str text);
+
+-- Fill 3 blocks with as many large records as will fit
+-- No FSM
+INSERT INTO fsm_check_size SELECT i, rpad('', 1024, 'a')
+FROM generate_series(1,7*3) i;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Clear some space on block 0
+DELETE FROM fsm_check_size WHERE num <= 5;
+VACUUM fsm_check_size;
+
+-- Insert small record in block 2 to set the cached smgr targetBlock
+INSERT INTO fsm_check_size VALUES(99, 'b');
+
+-- Insert large record and make sure it goes in block 0 rather than
+-- causing the relation to extend
+INSERT INTO fsm_check_size VALUES (101, rpad('', 1024, 'a'));
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Extend table with enough blocks to exceed the FSM threshold
+-- FSM is created and extended to 3 blocks
+INSERT INTO fsm_check_size SELECT i, 'c' FROM generate_series(200,1200) i;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Truncate heap to 1 block
+-- No change in FSM
+DELETE FROM fsm_check_size WHERE num > 7;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Truncate heap to 0 blocks
+-- FSM now truncated to 2 blocks
+DELETE FROM fsm_check_size;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Add long random string to extend TOAST table to 1 block
+INSERT INTO fsm_check_size
+VALUES(0, (SELECT string_agg(md5(chr(i)), '')
+ FROM generate_series(1,100) i));
+VACUUM fsm_check_size;
+SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size,
+pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size
+FROM pg_class WHERE relname = 'fsm_check_size';
+
+DROP TABLE fsm_check_size;