diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out
index 3fcd9fbe6d..89b73ca991 100644
--- a/contrib/pageinspect/expected/page.out
+++ b/contrib/pageinspect/expected/page.out
@@ -1,48 +1,69 @@
CREATE EXTENSION pageinspect;
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
-VACUUM test1; -- set up FSM
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i;
+-- set up FSM and VM
+VACUUM test_rel_forks;
-- The page contents can vary, so just test that it can be read
-- successfully, but don't keep the output.
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
main_0
--------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
-ERROR: block number 1 is out of range for relation "test1"
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
+ERROR: block number 100 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
fsm_0
-------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
- fsm_1
--------
- 8192
-(1 row)
-
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20;
+ERROR: block number 20 is out of range for relation "test_rel_forks"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
vm_0
------
8192
(1 row)
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
-ERROR: block number 1 is out of range for relation "test1"
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
+ERROR: block number 1 is out of range for relation "test_rel_forks"
SELECT octet_length(get_raw_page('xxx', 'main', 0));
ERROR: relation "xxx" does not exist
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
ERROR: invalid fork name
HINT: Valid fork names are "main", "fsm", "vm", and "init".
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+ fsm_page_contents
+-------------------
+ 0: 39 +
+ 1: 39 +
+ 3: 39 +
+ 7: 39 +
+ 15: 39 +
+ 31: 39 +
+ 63: 39 +
+ 127: 39 +
+ 255: 39 +
+ 511: 39 +
+ 1023: 39 +
+ 2047: 39 +
+ 4095: 39 +
+ fp_next_slot: 0 +
+
+(1 row)
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
?column?
----------
t
(1 row)
+DROP TABLE test_rel_forks;
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
pagesize | version
----------+---------
@@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi
{"\\x01000001","\\x00020200"}
(1 row)
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
- fsm_page_contents
--------------------
- 0: 254 +
- 1: 254 +
- 3: 254 +
- 7: 254 +
- 15: 254 +
- 31: 254 +
- 63: 254 +
- 127: 254 +
- 255: 254 +
- 511: 254 +
- 1023: 254 +
- 2047: 254 +
- 4095: 254 +
- fp_next_slot: 0 +
-
-(1 row)
-
DROP TABLE test1;
-- check that using any of these functions with a partitioned table or index
-- would fail
diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql
index 8ac9991837..67166ef54c 100644
--- a/contrib/pageinspect/sql/page.sql
+++ b/contrib/pageinspect/sql/page.sql
@@ -1,26 +1,35 @@
CREATE EXTENSION pageinspect;
-CREATE TABLE test1 (a int, b int);
-INSERT INTO test1 VALUES (16777217, 131584);
+CREATE TABLE test_rel_forks (a int);
+-- Make sure there are enough blocks in the heap for the FSM to be created.
+INSERT INTO test_rel_forks SELECT i from generate_series(1,2000) i;
-VACUUM test1; -- set up FSM
+-- set up FSM and VM
+VACUUM test_rel_forks;
-- The page contents can vary, so just test that it can be read
-- successfully, but don't keep the output.
-SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
-SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
-SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
-SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 20)) AS fsm_20;
-SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
-SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
+SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
SELECT octet_length(get_raw_page('xxx', 'main', 0));
-SELECT octet_length(get_raw_page('test1', 'xxx', 0));
+SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
-SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
+SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
+
+SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
+
+DROP TABLE test_rel_forks;
+
+CREATE TABLE test1 (a int, b int);
+INSERT INTO test1 VALUES (16777217, 131584);
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
@@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_
SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
FROM heap_page_items(get_raw_page('test1', 0));
-SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
-
DROP TABLE test1;
-- check that using any of these functions with a partitioned table or index
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index 8ef2ac8010..cbdad0c3fb 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -590,12 +590,13 @@ tuple would otherwise be too big.
FSMFree Space Map
-Each heap and index relation, except for hash indexes, has a Free Space Map
-(FSM) to keep track of available space in the relation. It's stored
-alongside the main relation data in a separate relation fork, named after the
-filenode number of the relation, plus a _fsm suffix. For example,
-if the filenode of a relation is 12345, the FSM is stored in a file called
-12345_fsm, in the same directory as the main relation file.
+Each heap relation, unless it is very small, and each index relation, except
+for hash indexes, has a Free Space Map (FSM) to keep track of available
+space in the relation. It's stored alongside the main relation data in a
+separate relation fork, named after the filenode number of the relation, plus
+a _fsm suffix. For example, if the filenode of a relation
+is 12345, the FSM is stored in a file called 12345_fsm,
+in the same directory as the main relation file.
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 467d91e681..8f008dd008 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -1150,7 +1150,7 @@ terminate_brin_buildstate(BrinBuildState *state)
freespace = PageGetFreeSpace(page);
blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
ReleaseBuffer(state->bs_currentInsertBuf);
- RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
+ RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
}
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c
index 164a468155..2eb354f948 100644
--- a/src/backend/access/brin/brin_pageops.c
+++ b/src/backend/access/brin/brin_pageops.c
@@ -310,7 +310,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
if (extended)
{
- RecordPageWithFreeSpace(idxrel, newblk, freespace);
+ RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
}
@@ -461,7 +461,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
if (extended)
{
- RecordPageWithFreeSpace(idxrel, blk, freespace);
+ RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber);
FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
}
@@ -654,7 +654,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
/* Measure free space and record it */
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
- br_page_get_freespace(page));
+ br_page_get_freespace(page), InvalidBlockNumber);
}
/*
@@ -703,7 +703,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
/* Choose initial target page, re-using existing target if known */
newblk = RelationGetTargetBlock(irel);
if (newblk == InvalidBlockNumber)
- newblk = GetPageWithFreeSpace(irel, itemsz);
+ newblk = GetPageWithFreeSpace(irel, itemsz, true);
/*
* Loop until we find a page with sufficient free space. By the time we
@@ -895,7 +895,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
* pages whose FSM records were forgotten in a crash.
*/
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
- br_page_get_freespace(page));
+ br_page_get_freespace(page), InvalidBlockNumber);
}
diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c
index d41d318eef..a9c8ec43a7 100644
--- a/src/backend/access/heap/hio.c
+++ b/src/backend/access/heap/hio.c
@@ -246,8 +246,14 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
* Immediately update the bottom level of the FSM. This has a good
* chance of making this page visible to other concurrently inserting
* backends, and we want that to happen without delay.
+ *
+ * Since we know the table will end up with extraBlocks additional
+ * pages, we pass the final number to avoid possible unnecessary
+ * system calls and to make sure the FSM is created when we add the
+ * first new page.
*/
- RecordPageWithFreeSpace(relation, blockNum, freespace);
+ RecordPageWithFreeSpace(relation, blockNum, freespace,
+ firstBlock + extraBlocks);
}
while (--extraBlocks > 0);
@@ -384,20 +390,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
* We have no cached target page, so ask the FSM for an initial
* target.
*/
- targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
-
- /*
- * If the FSM knows nothing of the rel, try the last page before we
- * give up and extend. This avoids one-tuple-per-page syndrome during
- * bootstrapping or in a recently-started system.
- */
- if (targetBlock == InvalidBlockNumber)
- {
- BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
-
- if (nblocks > 0)
- targetBlock = nblocks - 1;
- }
+ targetBlock = GetPageWithFreeSpace(relation,
+ len + saveFreeSpace,
+ false);
}
loop:
@@ -504,6 +499,13 @@ loop:
{
/* use this page as future insert target, too */
RelationSetTargetBlock(relation, targetBlock);
+
+ /*
+ * In case we used an in-memory map of available blocks, reset it
+ * for next use.
+ */
+ FSMClearLocalMap();
+
return buffer;
}
@@ -563,9 +565,12 @@ loop:
/*
* Check if some other backend has extended a block for us while
- * we were waiting on the lock.
+ * we were waiting on the lock. We only check the FSM -- if there
+ * isn't one we don't recheck the number of blocks.
*/
- targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
+ targetBlock = GetPageWithFreeSpace(relation,
+ len + saveFreeSpace,
+ true);
/*
* If some other waiter has already extended the relation, we
@@ -670,5 +675,11 @@ loop:
*/
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
+ /*
+ * In case we used an in-memory map of available blocks, reset it for next
+ * use.
+ */
+ FSMClearLocalMap();
+
return buffer;
}
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 26dfb0c7e0..9416c31889 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -153,7 +153,7 @@ static BufferAccessStrategy vac_strategy;
static void lazy_scan_heap(Relation onerel, int options,
LVRelStats *vacrelstats, Relation *Irel, int nindexes,
bool aggressive);
-static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
+static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks);
static bool lazy_check_needs_freeze(Buffer buf, bool *hastup);
static void lazy_vacuum_index(Relation indrel,
IndexBulkDeleteResult **stats,
@@ -758,7 +758,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
pgstat_progress_update_multi_param(2, hvp_index, hvp_val);
/* Remove tuples from heap */
- lazy_vacuum_heap(onerel, vacrelstats);
+ lazy_vacuum_heap(onerel, vacrelstats, nblocks);
/*
* Forget the now-vacuumed tuples, and press on, but be careful
@@ -897,7 +897,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
Size freespace;
freespace = BufferGetPageSize(buf) - SizeOfPageHeaderData;
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
}
}
continue;
@@ -941,7 +941,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
}
UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
continue;
}
@@ -1338,7 +1338,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
* taken if there are no indexes.)
*/
if (vacrelstats->num_dead_tuples == prev_dead_count)
- RecordPageWithFreeSpace(onerel, blkno, freespace);
+ RecordPageWithFreeSpace(onerel, blkno, freespace, nblocks);
}
/* report that everything is scanned and vacuumed */
@@ -1400,7 +1400,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
/* Remove tuples from heap */
pgstat_progress_update_param(PROGRESS_VACUUM_PHASE,
PROGRESS_VACUUM_PHASE_VACUUM_HEAP);
- lazy_vacuum_heap(onerel, vacrelstats);
+ lazy_vacuum_heap(onerel, vacrelstats, nblocks);
vacrelstats->num_index_scans++;
}
@@ -1471,9 +1471,10 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats,
* Note: the reason for doing this as a second pass is we cannot remove
* the tuples until we've removed their index entries, and we want to
* process index entry removal in batches as large as possible.
+ * Note: nblocks is passed as an optimization for RecordPageWithFreeSpace().
*/
static void
-lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
+lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats, BlockNumber nblocks)
{
int tupindex;
int npages;
@@ -1510,7 +1511,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
freespace = PageGetHeapFreeSpace(page);
UnlockReleaseBuffer(buf);
- RecordPageWithFreeSpace(onerel, tblk, freespace);
+ RecordPageWithFreeSpace(onerel, tblk, freespace, nblocks);
npages++;
}
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 0181976964..92bda87804 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -48,6 +48,7 @@
#include "replication/walsender.h"
#include "storage/condition_variable.h"
#include "storage/fd.h"
+#include "storage/freespace.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
@@ -2493,6 +2494,12 @@ AbortTransaction(void)
pgstat_report_wait_end();
pgstat_progress_end_command();
+ /*
+ * In case we aborted during RelationGetBufferForTuple(), clear the local
+ * map of heap pages.
+ */
+ FSMClearLocalMap();
+
/* Clean up buffer I/O and buffer context locks, too */
AbortBufferIO();
UnlockBuffers();
@@ -4714,6 +4721,13 @@ AbortSubTransaction(void)
pgstat_report_wait_end();
pgstat_progress_end_command();
+
+ /*
+ * In case we aborted during RelationGetBufferForTuple(), clear the local
+ * map of heap pages.
+ */
+ FSMClearLocalMap();
+
AbortBufferIO();
UnlockBuffers();
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b76f..0d3cd29772 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -8,7 +8,41 @@ free space to hold a tuple to be stored; or to determine that no such page
exists and the relation must be extended by one page. As of PostgreSQL 8.4
each relation has its own, extensible free space map stored in a separate
"fork" of its relation. This eliminates the disadvantages of the former
-fixed-size FSM.
+fixed-size FSM. There are two exceptions:
+
+1. Hash indexes never have a FSM.
+2. For very small tables, a 3-page relation fork would be relatively large
+and wasteful, so to save space we refrain from creating the FSM if the
+heap has HEAP_FSM_CREATION_THRESHOLD pages or fewer.
+
+To locate free space in the latter case, we simply try pages directly without
+knowing ahead of time how much free space they have. To maintain good
+performance, we create a local in-memory map of pages to try, and only mark
+every other page as available. For example, in a 3-page heap, the local map
+would look like:
+
+ANAN
+0123
+
+Pages 0 and 2 are marked "available", and page 1 as "not available".
+Page 3 is beyond the end of the relation, so is likewise marked "not
+available". First we try page 2, and if that doesn't have sufficient free
+space we try page 0 before giving up and extending the relation. There may
+be some wasted free space on block 1, but if the relation extends to 4 pages:
+
+NANA
+0123
+
+We not only have the new page 3 at our disposal, we can now check page 1
+for free space as well.
+
+Once the FSM is created for a heap, we don't remove it even if somebody
+deletes all the rows from the corresponding relation. Removing the FSM in
+that case is unlikely to be a useful optimization, since the relation will
+quite likely grow back to the same size.
+
+FSM data structure
+------------------
It is important to keep the map small so that it can be searched rapidly.
Therefore, we don't attempt to record the exact free space on a page.
@@ -192,5 +226,3 @@ TODO
----
- fastroot to avoid traversing upper nodes with just 1 child
-- use a different system for tables that fit into one FSM page, with a
- mechanism to switch to the real thing as it grows.
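
As a minimal standalone sketch of the scheme described in the README text above
(all names here are invented for the demo; only the marking and search rules
mirror fsm_local_set() and fsm_local_search() added in freespace.c below):

    /*
     * Minimal sketch of the no-FSM local map: mark every other block as
     * available starting from the last block, then search in descending
     * order.  Names are invented for this demo.
     */
    #include <stdio.h>
    #include <string.h>

    #define HEAP_FSM_CREATION_THRESHOLD 4
    #define INVALID_BLOCK (~0U)

    static unsigned map_nblocks;
    static char map[HEAP_FSM_CREATION_THRESHOLD];   /* 'A' = available, 'N' = not */

    /* Mirrors fsm_local_set(); assumes 1 <= cur_nblocks <= threshold. */
    static void
    local_map_set(unsigned cur_nblocks)
    {
        unsigned    blkno = cur_nblocks - 1;

        memset(map, 'N', sizeof(map));
        for (;;)
        {
            map[blkno] = 'A';
            if (blkno >= 2)
                blkno -= 2;
            else
                break;
        }
        map_nblocks = cur_nblocks;
    }

    /* Mirrors fsm_local_search(): highest available block, or INVALID_BLOCK. */
    static unsigned
    local_map_search(void)
    {
        unsigned    blkno = map_nblocks;

        do
        {
            blkno--;
            if (map[blkno] == 'A')
                return blkno;
        } while (blkno > 0);

        return INVALID_BLOCK;
    }

    int
    main(void)
    {
        unsigned    blkno;

        local_map_set(3);               /* the 3-page heap from the example */
        printf("map: %.4s\n", map);     /* prints ANAN */

        while ((blkno = local_map_search()) != INVALID_BLOCK)
        {
            printf("try block %u\n", blkno);    /* 2, then 0 */
            map[blkno] = 'N';           /* pretend it had no room */
        }
        printf("out of candidates: extend the relation\n");
        return 0;
    }

Running this prints the map "ANAN" and the search order 2, then 0, matching the
3-page example above.
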
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index eee8286057..d3f207b854 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -76,6 +76,14 @@
#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
#define FSM_BOTTOM_LEVEL 0
+/* Status codes for the local map. */
+
+/* Either already tried, or beyond the end of the relation */
+#define FSM_LOCAL_NOT_AVAIL 0x00
+
+/* Available to try */
+#define FSM_LOCAL_AVAIL 0x01
+
/*
* The internal FSM routines work on a logical addressing scheme. Each
* level of the tree can be thought of as a separately addressable file.
@@ -89,6 +97,23 @@ typedef struct
/* Address of the root page. */
static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
+/* Local map of block numbers for small heaps with no FSM. */
+typedef struct
+{
+ BlockNumber nblocks;
+ uint8 map[HEAP_FSM_CREATION_THRESHOLD];
+} FSMLocalMap;
+
+static FSMLocalMap fsm_local_map =
+{
+ 0,
+ {
+ FSM_LOCAL_NOT_AVAIL
+ }
+};
+
+#define FSM_LOCAL_MAP_EXISTS (fsm_local_map.nblocks > 0)
+
/* functions to navigate the tree */
static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
@@ -107,10 +132,14 @@ static Size fsm_space_cat_to_avail(uint8 cat);
/* workhorse functions for various operations */
static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
uint8 newValue, uint8 minValue);
+static void fsm_local_set(Relation rel, BlockNumber cur_nblocks);
static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static BlockNumber fsm_local_search(void);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof);
+static bool fsm_allow_writes(Relation rel, BlockNumber heapblk,
+ BlockNumber nblocks, BlockNumber *get_nblocks);
/******** Public API ********/
@@ -127,13 +156,46 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
* amount of free space available on that page and then try again (see
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
* extend the relation.
+ *
+ * For very small heap relations that don't have a FSM, we try every other
+ * page before extending the relation. To keep track of which pages have
+ * been tried, initialize a local in-memory map of pages.
*/
BlockNumber
-GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded, bool check_fsm_only)
{
uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
+ BlockNumber target_block,
+ nblocks;
- return fsm_search(rel, min_cat);
+ /* First try the FSM, if it exists. */
+ target_block = fsm_search(rel, min_cat);
+
+ if (target_block == InvalidBlockNumber &&
+ (rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE) &&
+ !check_fsm_only)
+ {
+ nblocks = RelationGetNumberOfBlocks(rel);
+
+ if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ {
+ /*
+ * If the FSM knows nothing of the rel, try the last page before
+ * we give up and extend. This avoids one-tuple-per-page syndrome
+ * during bootstrapping or in a recently-started system.
+ */
+ target_block = nblocks - 1;
+ }
+ else if (nblocks > 0)
+ {
+ /* Create or update local map and get first candidate block. */
+ fsm_local_set(rel, nblocks);
+ target_block = fsm_local_search();
+ }
+ }
+
+ return target_block;
}
/*
@@ -144,16 +206,47 @@ GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
* also some effort to return a page close to the old page; if there's a
* page with enough free space on the same FSM page where the old one page
* is located, it is preferred.
+ *
+ * For very small heap relations that don't have a FSM, we update the local
+ * map to indicate we have tried a page, and return the next page to try.
*/
BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
Size oldSpaceAvail, Size spaceNeeded)
{
- int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
- int search_cat = fsm_space_needed_to_cat(spaceNeeded);
+ int old_cat;
+ int search_cat;
FSMAddress addr;
uint16 slot;
int search_slot;
+ BlockNumber nblocks = InvalidBlockNumber;
+
+ /* First try the local map, if it exists. */
+ if (FSM_LOCAL_MAP_EXISTS)
+ {
+ Assert((rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE) &&
+ fsm_local_map.map[oldPage] == FSM_LOCAL_AVAIL);
+
+ fsm_local_map.map[oldPage] = FSM_LOCAL_NOT_AVAIL;
+ return fsm_local_search();
+ }
+
+ if (!fsm_allow_writes(rel, oldPage, InvalidBlockNumber, &nblocks))
+ {
+ /*
+ * If we have neither a local map nor a FSM, we probably just tried
+ * the target block in the smgr relation entry and failed, so we'll
+ * need to create the local map.
+ */
+ fsm_local_set(rel, nblocks);
+ return fsm_local_search();
+ }
+
+ /* Normal FSM logic follows */
+
+ old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+ search_cat = fsm_space_needed_to_cat(spaceNeeded);
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(oldPage, &slot);
@@ -176,20 +269,44 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
* Note that if the new spaceAvail value is higher than the old value stored
* in the FSM, the space might not become visible to searchers until the next
* FreeSpaceMapVacuum call, which updates the upper level pages.
+ *
+ * Callers have no need for a local map.
*/
void
-RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
+ Size spaceAvail, BlockNumber nblocks)
{
- int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ int new_cat;
FSMAddress addr;
uint16 slot;
+ BlockNumber dummy;
+
+ if (!fsm_allow_writes(rel, heapBlk, nblocks, &dummy))
+ /* No FSM to update and no local map either */
+ return;
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(heapBlk, &slot);
+ new_cat = fsm_space_avail_to_cat(spaceAvail);
fsm_set_and_search(rel, addr, slot, new_cat, 0);
}
+/*
+ * Clear the local map. We must call this when we have found a block with
+ * enough free space, when we extend the relation, or on transaction abort.
+ */
+void
+FSMClearLocalMap(void)
+{
+ if (FSM_LOCAL_MAP_EXISTS)
+ {
+ fsm_local_map.nblocks = 0;
+ memset(&fsm_local_map.map, FSM_LOCAL_NOT_AVAIL,
+ sizeof(fsm_local_map.map));
+ }
+}
+
/*
* XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
* WAL replay
@@ -204,6 +321,31 @@ XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
BlockNumber blkno;
Buffer buf;
Page page;
+ bool write_to_fsm;
+
+ /* This is meant to mirror the logic in fsm_allow_writes() */
+ if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD)
+ write_to_fsm = true;
+ else
+ {
+ /* Open the relation at smgr level */
+ SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
+
+ if (smgrexists(smgr, FSM_FORKNUM))
+ write_to_fsm = true;
+ else
+ {
+ BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM);
+
+ if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ write_to_fsm = true;
+ else
+ write_to_fsm = false;
+ }
+ }
+
+ if (!write_to_fsm)
+ return;
/* Get the location of the FSM byte representing the heap block */
addr = fsm_get_location(heapBlk, &slot);
@@ -904,3 +1046,134 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
return max_avail;
}
+
+/*
+ * For heaps, we prevent creation of the FSM unless the number of pages
+ * exceeds HEAP_FSM_CREATION_THRESHOLD. For tables that don't already have
+ * a FSM, this will save an inode and a few kB of space.
+ *
+ * XXX The API is a little awkward -- if the caller passes a valid nblocks
+ * value, it can avoid invoking a system call. If the caller passes
+ * InvalidBlockNumber and receives a false return value, it can get an
+ * up-to-date relation size from get_nblocks. This saves a few cycles in
+ * the caller, which would otherwise need to get the relation size by itself.
+ */
+static bool
+fsm_allow_writes(Relation rel, BlockNumber heapblk,
+ BlockNumber nblocks, BlockNumber *get_nblocks)
+{
+ bool skip_get_nblocks;
+
+ if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+
+ /* Non-heap rels can always create a FSM. */
+ if (rel->rd_rel->relkind != RELKIND_RELATION &&
+ rel->rd_rel->relkind != RELKIND_TOASTVALUE)
+ return true;
+
+ /*
+ * If the caller knows nblocks, we can avoid a system call later. If it
+ * doesn't, maybe we have relpages from a previous VACUUM. Since the table
+ * may have extended since then, we still have to count the pages later if
+ * we can't return now.
+ */
+ if (nblocks != InvalidBlockNumber)
+ {
+ if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ skip_get_nblocks = true;
+ }
+ else
+ {
+ if (rel->rd_rel->relpages != InvalidBlockNumber &&
+ rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ skip_get_nblocks = false;
+ }
+
+ RelationOpenSmgr(rel);
+ if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
+ return true;
+
+ if (skip_get_nblocks)
+ return false;
+
+ /* last resort */
+ *get_nblocks = RelationGetNumberOfBlocks(rel);
+ if (*get_nblocks > HEAP_FSM_CREATION_THRESHOLD)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * Initialize or update the local map of blocks to try, for when there is
+ * no FSM.
+ *
+ * When we initialize the map, the whole heap is potentially available to
+ * try. Testing revealed that trying every block can cause a small
+ * performance dip compared to when we use a FSM, so we try every other
+ * block instead.
+ */
+static void
+fsm_local_set(Relation rel, BlockNumber cur_nblocks)
+{
+ BlockNumber blkno,
+ cached_target_block;
+
+ /* The local map must not be set already. */
+ Assert(!FSM_LOCAL_MAP_EXISTS);
+
+ /*
+ * Starting at the current last block in the relation and working
+ * backwards, mark alternating blocks as available.
+ */
+ blkno = cur_nblocks - 1;
+ while (true)
+ {
+ fsm_local_map.map[blkno] = FSM_LOCAL_AVAIL;
+ if (blkno >= 2)
+ blkno -= 2;
+ else
+ break;
+ }
+
+ /* Cache the number of blocks. */
+ fsm_local_map.nblocks = cur_nblocks;
+
+ /* Set the status of the cached target block to 'unavailable'. */
+ cached_target_block = RelationGetTargetBlock(rel);
+ if (cached_target_block != InvalidBlockNumber &&
+ cached_target_block < cur_nblocks)
+ fsm_local_map.map[cached_target_block] = FSM_LOCAL_NOT_AVAIL;
+}
+
+/*
+ * Search the local map for an available block to try, in descending order.
+ * There is no firm basis for preferring one search order over another, but
+ * the last block in the map is the one most recently added to the heap, so
+ * it is the most likely to have free space.
+ *
+ * This function is used when there is no FSM.
+ */
+static BlockNumber
+fsm_local_search(void)
+{
+ BlockNumber target_block;
+
+ /* Local map must be set by now. */
+ Assert(FSM_LOCAL_MAP_EXISTS);
+
+ target_block = fsm_local_map.nblocks;
+ do
+ {
+ target_block--;
+ if (fsm_local_map.map[target_block] == FSM_LOCAL_AVAIL)
+ return target_block;
+ } while (target_block > 0);
+
+ return InvalidBlockNumber;
+}
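
To make the XXX-commented contract of fsm_allow_writes() above easier to
follow, here is a minimal standalone sketch of the same decision flow with the
Relation/smgr lookups replaced by fields of a stub struct; the struct and all
names in it are invented for this demo, and only the control flow mirrors the
function above.

    /*
     * Standalone sketch of the fsm_allow_writes() decision flow.  The stub
     * struct stands in for the Relation/smgr state the real function reads.
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define HEAP_FSM_CREATION_THRESHOLD 4
    #define INVALID_BLOCK (~0U)

    typedef struct
    {
        bool        is_heap_or_toast;   /* RELKIND_RELATION or RELKIND_TOASTVALUE */
        bool        fsm_fork_exists;    /* smgrexists(rel->rd_smgr, FSM_FORKNUM) */
        unsigned    relpages_hint;      /* rel->rd_rel->relpages, possibly stale */
        unsigned    actual_nblocks;     /* RelationGetNumberOfBlocks(rel) */
    } StubRel;

    static bool
    stub_allow_writes(const StubRel *rel, unsigned heapblk,
                      unsigned nblocks, unsigned *get_nblocks)
    {
        bool        skip_get_nblocks;

        if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
            return true;                /* block itself is past the threshold */

        if (!rel->is_heap_or_toast)
            return true;                /* non-heap relations always get a FSM */

        if (nblocks != INVALID_BLOCK)
        {
            if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
                return true;
            skip_get_nblocks = true;    /* caller's count is authoritative */
        }
        else
        {
            if (rel->relpages_hint != INVALID_BLOCK &&
                rel->relpages_hint > HEAP_FSM_CREATION_THRESHOLD)
                return true;            /* the hint is only used to say "yes" */
            skip_get_nblocks = false;
        }

        if (rel->fsm_fork_exists)
            return true;                /* a FSM already exists: keep using it */

        if (skip_get_nblocks)
            return false;

        /* last resort: count the blocks and report the result back */
        *get_nblocks = rel->actual_nblocks;
        return *get_nblocks > HEAP_FSM_CREATION_THRESHOLD;
    }

    int
    main(void)
    {
        StubRel     small = {true, false, INVALID_BLOCK, 3};
        unsigned    fresh = INVALID_BLOCK;

        /* Convention 1: caller already knows the size; no block count needed. */
        printf("known size 3 -> %d\n", stub_allow_writes(&small, 1, 3, &fresh));

        /* Convention 2: unknown size; on false, the fresh size comes back. */
        if (!stub_allow_writes(&small, 1, INVALID_BLOCK, &fresh))
            printf("no FSM: build the local map over %u blocks\n", fresh);
        return 0;
    }
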
diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c
index 58cedeaa9f..9d8f43d373 100644
--- a/src/backend/storage/freespace/indexfsm.c
+++ b/src/backend/storage/freespace/indexfsm.c
@@ -37,7 +37,7 @@
BlockNumber
GetFreeIndexPage(Relation rel)
{
- BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2);
+ BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2, true);
if (blkno != InvalidBlockNumber)
RecordUsedIndexPage(rel, blkno);
@@ -51,7 +51,7 @@ GetFreeIndexPage(Relation rel)
void
RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
{
- RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+ RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1, InvalidBlockNumber);
}
@@ -61,7 +61,7 @@ RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
void
RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
{
- RecordPageWithFreeSpace(rel, usedBlock, 0);
+ RecordPageWithFreeSpace(rel, usedBlock, 0, InvalidBlockNumber);
}
/*
diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h
index 8b00033438..dbaae651c5 100644
--- a/src/include/storage/freespace.h
+++ b/src/include/storage/freespace.h
@@ -18,15 +18,20 @@
#include "storage/relfilenode.h"
#include "utils/relcache.h"
+/* Only create the FSM if the heap has greater than this many blocks */
+#define HEAP_FSM_CREATION_THRESHOLD 4
+
/* prototypes for public functions in freespace.c */
extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk);
-extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded);
+extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded,
+ bool check_fsm_only);
extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel,
BlockNumber oldPage,
Size oldSpaceAvail,
Size spaceNeeded);
extern void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk,
- Size spaceAvail);
+ Size spaceAvail, BlockNumber nblocks);
+extern void FSMClearLocalMap(void);
extern void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
Size spaceAvail);
diff --git a/src/test/regress/expected/fsm.out b/src/test/regress/expected/fsm.out
new file mode 100644
index 0000000000..b02993188c
--- /dev/null
+++ b/src/test/regress/expected/fsm.out
@@ -0,0 +1,48 @@
+--
+-- Free Space Map test
+--
+CREATE TABLE fsm_check_size (num int, str text);
+-- With one block, there should be no FSM
+INSERT INTO fsm_check_size VALUES(1, 'a');
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ heap_size | fsm_size
+-----------+----------
+ 8192 | 0
+(1 row)
+
+-- Extend table with enough blocks to exceed the FSM threshold
+DO $$
+DECLARE curtid tid;
+num int;
+BEGIN
+num = 11;
+ LOOP
+ INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid;
+ EXIT WHEN curtid >= tid '(4, 0)';
+ num = num + 1;
+ END LOOP;
+END;
+$$;
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+ fsm_size
+----------
+ 24576
+(1 row)
+
+-- Add long random string to extend TOAST table to 1 block
+INSERT INTO fsm_check_size
+VALUES(0, (SELECT string_agg(md5(chr(i)), '')
+ FROM generate_series(1,100) i));
+VACUUM fsm_check_size;
+SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size,
+pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size
+FROM pg_class WHERE relname = 'fsm_check_size';
+ toast_size | toast_fsm_size
+------------+----------------
+ 8192 | 0
+(1 row)
+
+DROP TABLE fsm_check_size;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index cc0bbf5db9..4051a4ad4e 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -68,6 +68,12 @@ test: create_aggregate create_function_3 create_cast constraints triggers inheri
# ----------
test: sanity_check
+# ----------
+# fsm does a delete followed by vacuum, and running it in parallel can prevent
+# removal of rows.
+# ----------
+test: fsm
+
# ----------
# Believe it or not, select creates a table, subsequent
# tests need.
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 0c10c7100c..ac1ea622d6 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -80,6 +80,7 @@ test: roleattributes
test: create_am
test: hash_func
test: sanity_check
+test: fsm
test: errors
test: select
test: select_into
diff --git a/src/test/regress/sql/fsm.sql b/src/test/regress/sql/fsm.sql
new file mode 100644
index 0000000000..332c3e2b2d
--- /dev/null
+++ b/src/test/regress/sql/fsm.sql
@@ -0,0 +1,41 @@
+--
+-- Free Space Map test
+--
+
+CREATE TABLE fsm_check_size (num int, str text);
+
+-- With one block, there should be no FSM
+INSERT INTO fsm_check_size VALUES(1, 'a');
+
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'main') AS heap_size,
+pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Extend table with enough blocks to exceed the FSM threshold
+DO $$
+DECLARE curtid tid;
+num int;
+BEGIN
+num = 11;
+ LOOP
+ INSERT INTO fsm_check_size VALUES (num, 'b') RETURNING ctid INTO curtid;
+ EXIT WHEN curtid >= tid '(4, 0)';
+ num = num + 1;
+ END LOOP;
+END;
+$$;
+
+VACUUM fsm_check_size;
+SELECT pg_relation_size('fsm_check_size', 'fsm') AS fsm_size;
+
+-- Add long random string to extend TOAST table to 1 block
+INSERT INTO fsm_check_size
+VALUES(0, (SELECT string_agg(md5(chr(i)), '')
+ FROM generate_series(1,100) i));
+
+VACUUM fsm_check_size;
+SELECT pg_relation_size(reltoastrelid, 'main') AS toast_size,
+pg_relation_size(reltoastrelid, 'fsm') AS toast_fsm_size
+FROM pg_class WHERE relname = 'fsm_check_size';
+
+DROP TABLE fsm_check_size;