Skip full index scan during cleanup of B-tree indexes when possible

Vacuum of an index consists of two stages: multiple (zero or more) ambulkdelete
calls and one amvacuumcleanup call. When the workload on a particular table
is append-only, autovacuum isn't intended to touch this table. However,
a user may run vacuum manually in order to fill the visibility map and get the
benefits of index-only scans. Then ambulkdelete wouldn't be called for indexes
of such a table (because no heap tuples were deleted), only amvacuumcleanup
would be called. In this case, amvacuumcleanup would perform a full index scan
for two objectives: put recyclable pages into the free space map and update
index statistics.

This patch allows btvacuumcleanup to skip the full index scan when two conditions
are satisfied: no pages are going to be put into the free space map and index
statistics aren't stale. In order to check the first condition, we store the
oldest btpo_xact in the meta-page. When it precedes RecentGlobalXmin, then
there are some recyclable pages. In order to check the second condition, we store
the number of heap tuples observed during the previous full index scan by cleanup.
If the fraction of newly inserted tuples is less than
vacuum_cleanup_index_scale_factor, then statistics aren't considered to be
stale. vacuum_cleanup_index_scale_factor can be defined as both reloption and GUC (default).

This patch bumps B-tree meta-page version. Upgrade of meta-page is performed
"on the fly": during VACUUM meta-page is rewritten with new version. No special
handling in pg_upgrade is required.

Author: Masahiko Sawada, Alexander Korotkov
Review by: Peter Geoghegan, Kyotaro Horiguchi, Alexander Korotkov, Yura Sokolov
Discussion: https://www.postgresql.org/message-id/flat/CAD21AoAX+d2oD_nrd9O2YkpzHaFr=uQeGr9s1rKC3O4ENc568g@mail.gmail.com
This commit is contained in:
Teodor Sigaev 2018-04-04 19:29:00 +03:00
parent eac93e20af
commit 857f9c36cd
23 changed files with 458 additions and 45 deletions

View File

@ -1500,12 +1500,14 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
errmsg("index \"%s\" meta page is corrupt",
RelationGetRelationName(state->rel))));
if (metad->btm_version != BTREE_VERSION)
if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d",
errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(state->rel),
metad->btm_version, BTREE_VERSION)));
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
}
/*

View File

@ -5,7 +5,8 @@ OBJS = rawpage.o heapfuncs.o btreefuncs.o fsmfuncs.o \
brinfuncs.o ginfuncs.o hashfuncs.o $(WIN32RES)
EXTENSION = pageinspect
DATA = pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
DATA = pageinspect--1.6--1.7.sql \
pageinspect--1.5.sql pageinspect--1.5--1.6.sql \
pageinspect--1.4--1.5.sql pageinspect--1.3--1.4.sql \
pageinspect--1.2--1.3.sql pageinspect--1.1--1.2.sql \
pageinspect--1.0--1.1.sql pageinspect--unpackaged--1.0.sql

View File

@ -511,7 +511,7 @@ bt_metap(PG_FUNCTION_ARGS)
BTMetaPageData *metad;
TupleDesc tupleDesc;
int j;
char *values[6];
char *values[8];
Buffer buffer;
Page page;
HeapTuple tuple;
@ -555,6 +555,8 @@ bt_metap(PG_FUNCTION_ARGS)
values[j++] = psprintf("%d", metad->btm_level);
values[j++] = psprintf("%d", metad->btm_fastroot);
values[j++] = psprintf("%d", metad->btm_fastlevel);
values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
values[j++] = psprintf("%lf", metad->btm_last_cleanup_num_heap_tuples);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
values);

View File

@ -3,13 +3,15 @@ INSERT INTO test1 VALUES (72057594037927937, 'text');
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
-[ RECORD 1 ]-----
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 2
version | 3
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 0
last_cleanup_num_tuples | -1
SELECT * FROM bt_page_stats('test1_a_idx', 0);
ERROR: block 0 is a meta page

View File

@ -0,0 +1,26 @@
/* contrib/pageinspect/pageinspect--1.6--1.7.sql */
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
\echo Use "ALTER EXTENSION pageinspect UPDATE TO '1.7'" to load this file. \quit
--
-- bt_metap()
--
-- Redefine bt_metap() so that it also reports the two cleanup-related
-- metapage fields.  The 1.6 function has to be dropped first: adding OUT
-- parameters changes the function's result row type, which
-- CREATE OR REPLACE FUNCTION cannot do.
--
DROP FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4);
-- oldest_xact is declared as xid rather than int4: the C implementation
-- prints the TransactionId as an unsigned 32-bit number, so values above
-- 2^31-1 would be rejected by the int4 input function.
CREATE FUNCTION bt_metap(IN relname text,
    OUT magic int4,
    OUT version int4,
    OUT root int4,
    OUT level int4,
    OUT fastroot int4,
    OUT fastlevel int4,
    OUT oldest_xact xid,
    OUT last_cleanup_num_tuples real)
AS 'MODULE_PATHNAME', 'bt_metap'
LANGUAGE C STRICT PARALLEL SAFE;

View File

@ -1,5 +1,5 @@
# pageinspect extension
comment = 'inspect the contents of database pages at a low level'
default_version = '1.6'
default_version = '1.7'
module_pathname = '$libdir/pageinspect'
relocatable = true

View File

@ -48,7 +48,7 @@ select version, tree_level,
from pgstatindex('test_pkey');
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@ -58,7 +58,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::text);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@ -68,7 +68,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::name);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select version, tree_level,
@ -78,7 +78,7 @@ select version, tree_level,
from pgstatindex('test_pkey'::regclass);
version | tree_level | index_size | root_block_no | internal_pages | leaf_pages | empty_pages | deleted_pages | avg_leaf_density | leaf_fragmentation
---------+------------+------------+---------------+----------------+------------+-------------+---------------+------------------+--------------------
2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | NaN | NaN
(1 row)
select pg_relpages('test');
@ -229,7 +229,7 @@ create index test_partition_hash_idx on test_partition using hash (a);
select pgstatindex('test_partition_idx');
pgstatindex
------------------------------
(2,0,8192,0,0,0,0,0,NaN,NaN)
(3,0,8192,0,0,0,0,0,NaN,NaN)
(1 row)
select pgstathashindex('test_partition_hash_idx');

View File

@ -1882,6 +1882,31 @@ include_dir 'conf.d'
</note>
</sect2>
<sect2 id="runtime-config-index-vacuum">
<title>Index Vacuum</title>
<variablelist>
<varlistentry id="guc-vacuum-cleanup-index-scale-factor" xreflabel="vacuum_cleanup_index_scale_factor">
<term><varname>vacuum_cleanup_index_scale_factor</varname> (<type>floating point</type>)
<indexterm>
<primary><varname>vacuum_cleanup_index_scale_factor</varname> configuration parameter</primary>
</indexterm>
</term>
<listitem>
<para>
When no tuples were deleted from the heap, B-tree indexes might still
be scanned during the <command>VACUUM</command> cleanup stage for two
reasons. The first reason is that the B-tree index contains deleted pages
which can be recycled during cleanup. The second reason is that the B-tree
index statistics are stale. Index statistics are considered stale when the
number of tuples inserted since the previous statistics collection
is greater than the <varname>vacuum_cleanup_index_scale_factor</varname>
fraction of the total number of heap tuples.
</para>
</listitem>
</varlistentry>
</variablelist>
</sect2>
<sect2 id="runtime-config-resource-background-writer">
<title>Background Writer</title>

View File

@ -247,13 +247,15 @@ test=# SELECT * FROM heap_page_item_attrs(get_raw_page('pg_class', 0), 'pg_class
index's metapage. For example:
<screen>
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
-[ RECORD 1 ]-----
-[ RECORD 1 ]-----------+-------
magic | 340322
version | 2
version | 3
root | 1
level | 0
fastroot | 1
fastlevel | 0
oldest_xact | 582
last_cleanup_num_tuples | 1000
</screen>
</para>
</listitem>

View File

@ -369,6 +369,21 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
</varlistentry>
</variablelist>
<para>
B-tree indexes additionally accept this parameter:
</para>
<variablelist>
<varlistentry>
<term><literal>vacuum_cleanup_index_scale_factor</literal></term>
<listitem>
<para>
Per-index value for <xref linkend="guc-vacuum-cleanup-index-scale-factor"/>.
</para>
</listitem>
</varlistentry>
</variablelist>
<para>
GiST indexes additionally accept this parameter:
</para>

View File

@ -409,6 +409,15 @@ static relopt_real realRelOpts[] =
},
0, -1.0, DBL_MAX
},
{
{
"vacuum_cleanup_index_scale_factor",
"Number of tuple inserts prior to index cleanup as a fraction of reltuples.",
RELOPT_KIND_BTREE,
ShareUpdateExclusiveLock
},
-1, 0.0, 100.0
},
/* list terminator */
{{NULL}}
};
@ -1371,7 +1380,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
{"user_catalog_table", RELOPT_TYPE_BOOL,
offsetof(StdRdOptions, user_catalog_table)},
{"parallel_workers", RELOPT_TYPE_INT,
offsetof(StdRdOptions, parallel_workers)}
offsetof(StdRdOptions, parallel_workers)},
{"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL,
offsetof(StdRdOptions, vacuum_cleanup_index_scale_factor)}
};
options = parseRelOptions(reloptions, validate, kind, &numoptions);

View File

@ -939,6 +939,9 @@ _bt_insertonpg(Relation rel,
if (BufferIsValid(metabuf))
{
/* upgrade meta-page if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = itup_blkno;
metad->btm_fastlevel = lpageop->btpo.level;
MarkBufferDirty(metabuf);
@ -997,6 +1000,9 @@ _bt_insertonpg(Relation rel,
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
@ -2049,6 +2055,10 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
/*
* Create downlink item for left page (old root). Since this will be the
* first item in a non-leaf page, it implicitly has minus-infinity key
@ -2138,6 +2148,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));

View File

@ -60,6 +60,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metad->btm_level = level;
metad->btm_fastroot = rootbknum;
metad->btm_fastlevel = level;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
metaopaque->btpo_flags = BTP_META;
@ -73,6 +75,114 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
/*
 *	_bt_upgrademetapage() -- Upgrade a meta-page from an old format to the new.
 *
 *		This routine does purely in-memory image upgrade.  Caller is
 *		responsible for locking, WAL-logging etc.
 *
 *		'page' must be a B-tree meta page whose btm_version is at least
 *		BTREE_MIN_VERSION and below BTREE_VERSION; this is only checked
 *		by assertions.
 */
void
_bt_upgrademetapage(Page page)
{
	BTMetaPageData *metad;

	/* only read inside Assert(), hence the marker for non-assert builds */
	BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

	metad = BTPageGetMeta(page);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* It must be really a meta page of upgradable version */
	Assert(metaopaque->btpo_flags & BTP_META);
	Assert(metad->btm_version < BTREE_VERSION);
	Assert(metad->btm_version >= BTREE_MIN_VERSION);

	/*
	 * Set version number and fill extra fields added into version 3.  The
	 * values are the same "nothing recorded yet" defaults that
	 * _bt_initmetapage() uses for a freshly created metapage.
	 */
	metad->btm_version = BTREE_VERSION;
	metad->btm_oldest_btpo_xact = InvalidTransactionId;
	metad->btm_last_cleanup_num_heap_tuples = -1.0;

	/*
	 * Adjust pd_lower (see _bt_initmetapage() for details): it has to point
	 * just past the metadata, which is now larger than it was in the old
	 * format.
	 */
	((PageHeader) page)->pd_lower =
		((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
/*
 *	_bt_update_meta_cleanup_info() -- Update cleanup-related information in
 *									  the metapage.
 *
 *		This routine checks whether the provided cleanup-related information
 *		matches what is stored in the metapage.  On mismatch, or when the
 *		metapage still has a pre-BTREE_VERSION format, the metapage is
 *		overwritten (and WAL-logged if the relation needs WAL).
 *
 *		rel				- the B-tree index
 *		oldestBtpoXact	- value to store in btm_oldest_btpo_xact
 *		numHeapTuples	- value to store in btm_last_cleanup_num_heap_tuples
 */
void
_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
							 float8 numHeapTuples)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		needsRewrite = false;

	/* read the metapage and check if it needs rewrite */
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	/* outdated version of metapage always needs rewrite */
	if (metad->btm_version < BTREE_VERSION)
		needsRewrite = true;
	else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
			 metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
		needsRewrite = true;

	/* nothing to change: avoid dirtying the page and emitting WAL */
	if (!needsRewrite)
	{
		_bt_relbuf(rel, metabuf);
		return;
	}

	/* trade in our read lock for a write lock */
	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
	LockBuffer(metabuf, BT_WRITE);

	START_CRIT_SECTION();

	/*
	 * Upgrade meta-page if needed.  The version is tested again here, now
	 * that we hold the write lock.
	 */
	if (metad->btm_version < BTREE_VERSION)
		_bt_upgrademetapage(metapg);

	/* update cleanup-related information */
	metad->btm_oldest_btpo_xact = oldestBtpoXact;
	metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
	MarkBufferDirty(metabuf);

	/* write wal record if needed */
	if (RelationNeedsWAL(rel))
	{
		xl_btree_metadata md;
		XLogRecPtr	recptr;

		XLogBeginInsert();
		XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

		/* the record carries the complete set of metapage fields */
		md.root = metad->btm_root;
		md.level = metad->btm_level;
		md.fastroot = metad->btm_fastroot;
		md.fastlevel = metad->btm_fastlevel;
		md.oldest_btpo_xact = oldestBtpoXact;
		md.last_cleanup_num_heap_tuples = numHeapTuples;

		XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));

		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

		PageSetLSN(metapg, recptr);
	}

	END_CRIT_SECTION();

	_bt_relbuf(rel, metabuf);
}
/*
* _bt_getroot() -- Get the root page of the btree.
*
@ -124,7 +234,8 @@ _bt_getroot(Relation rel, int access)
metad = (BTMetaPageData *) rel->rd_amcache;
/* We shouldn't have cached it if any of these fail */
Assert(metad->btm_magic == BTREE_MAGIC);
Assert(metad->btm_version == BTREE_VERSION);
Assert(metad->btm_version >= BTREE_MIN_VERSION);
Assert(metad->btm_version <= BTREE_VERSION);
Assert(metad->btm_root != P_NONE);
rootblkno = metad->btm_fastroot;
@ -170,12 +281,14 @@ _bt_getroot(Relation rel, int access)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION)
if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d",
errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION)));
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, do it */
if (metad->btm_root == P_NONE)
@ -191,6 +304,10 @@ _bt_getroot(Relation rel, int access)
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
LockBuffer(metabuf, BT_WRITE);
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
/*
* Race condition: if someone else initialized the metadata between
* the time we released the read lock and acquired the write lock, we
@ -229,6 +346,8 @@ _bt_getroot(Relation rel, int access)
metad->btm_level = 0;
metad->btm_fastroot = rootblkno;
metad->btm_fastlevel = 0;
metad->btm_oldest_btpo_xact = InvalidTransactionId;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
MarkBufferDirty(rootbuf);
MarkBufferDirty(metabuf);
@ -248,6 +367,8 @@ _bt_getroot(Relation rel, int access)
md.level = 0;
md.fastroot = rootblkno;
md.fastlevel = 0;
md.oldest_btpo_xact = InvalidTransactionId;
md.last_cleanup_num_heap_tuples = -1.0;
XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
@ -373,12 +494,14 @@ _bt_gettrueroot(Relation rel)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION)
if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d",
errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION)));
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/* if no root page initialized yet, fail */
if (metad->btm_root == P_NONE)
@ -460,12 +583,14 @@ _bt_getrootheight(Relation rel)
errmsg("index \"%s\" is not a btree",
RelationGetRelationName(rel))));
if (metad->btm_version != BTREE_VERSION)
if (metad->btm_version < BTREE_MIN_VERSION ||
metad->btm_version > BTREE_VERSION)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("version mismatch in index \"%s\": file version %d, code version %d",
errmsg("version mismatch in index \"%s\": file version %d, "
"current version %d, minimal supported version %d",
RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION)));
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
/*
* If there's no root page yet, _bt_getroot() doesn't expect a cache
@ -1784,6 +1909,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
/* And update the metapage, if needed */
if (BufferIsValid(metabuf))
{
/* upgrade metapage if needed */
if (metad->btm_version < BTREE_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = rightsib;
metad->btm_fastlevel = targetlevel;
MarkBufferDirty(metabuf);
@ -1834,6 +1962,8 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
XLogRegisterBufData(4, (char *) &xlmeta, sizeof(xl_btree_metadata));
xlinfo = XLOG_BTREE_UNLINK_PAGE_META;

View File

@ -19,11 +19,14 @@
#include "postgres.h"
#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
@ -45,6 +48,7 @@ typedef struct
BlockNumber lastBlockVacuumed; /* highest blkno actually vacuumed */
BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */
BlockNumber totFreePages; /* true total # of free pages */
TransactionId oldestBtpoXact;
MemoryContext pagedelcontext;
} BTVacState;
@ -89,7 +93,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
BTCycleId cycleid);
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
@ -773,6 +777,70 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan)
SpinLockRelease(&btscan->btps_mutex);
}
/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
 *			btbulkdelete() wasn't called.
 *
 * Inspects the metapage under a read lock and returns true when any of the
 * following holds:
 *	- the metapage has a pre-BTREE_VERSION format, so no cleanup-related
 *	  information is stored in it yet;
 *	- the stored oldest btpo.xact is valid and precedes RecentGlobalXmin;
 *	- no tuple count from a previous cleanup is stored (negative value), or
 *	  info->num_heap_tuples exceeds the stored count by more than the
 *	  effective vacuum_cleanup_index_scale_factor fraction.
 * Otherwise returns false.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If oldest btpo.xact in the deleted pages is older than
		 * RecentGlobalXmin, then at least one deleted page can be recycled.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;

		/*
		 * If the table receives a large enough amount of insertions and no
		 * cleanup was performed, then the index might appear to have stale
		 * statistics.  In order to avoid that, we perform cleanup when the
		 * table receives vacuum_cleanup_index_scale_factor fractions of
		 * insertions.
		 *
		 * A non-negative per-index reloption takes precedence over the
		 * vacuum_cleanup_index_scale_factor GUC.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;

		if (cleanup_scale_factor < 0 ||
			metad->btm_last_cleanup_num_heap_tuples < 0 ||
			info->num_heap_tuples > (1.0 + cleanup_scale_factor) *
			metad->btm_last_cleanup_num_heap_tuples)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
/*
* Bulk deletion of all index entries pointing to a set of heap tuples.
* The set of target tuples is specified via a callback routine that tells
@ -795,9 +863,20 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* The ENSURE stuff ensures we clean up shared memory on failure */
PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{
TransactionId oldestBtpoXact;
cycleid = _bt_start_vacuum(rel);
btvacuumscan(info, stats, callback, callback_state, cycleid);
btvacuumscan(info, stats, callback, callback_state, cycleid,
&oldestBtpoXact);
/*
* Update cleanup-related information in metapage. These information
* is used only for cleanup but keeping up them to date can avoid
* unnecessary cleanup even after bulkdelete.
*/
_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
info->num_heap_tuples);
}
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel);
@ -819,17 +898,28 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
/*
* If btbulkdelete was called, we need not do anything, just return the
* stats from the latest btbulkdelete call. If it wasn't called, we must
* still do a pass over the index, to recycle any newly-recyclable pages
* and to obtain index statistics.
* stats from the latest btbulkdelete call. If it wasn't called, we might
* still need to do a pass over the index, to recycle any newly-recyclable
* pages and to obtain index statistics. _bt_vacuum_needs_cleanup checks
* if there are newly-recyclable pages or stale index statistics.
*
* Since we aren't going to actually delete any leaf items, there's no
* need to go through all the vacuum-cycle-ID pushups.
*/
if (stats == NULL)
{
TransactionId oldestBtpoXact;
/* Check if we need a cleanup */
if (!_bt_vacuum_needs_cleanup(info))
return NULL;
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
btvacuumscan(info, stats, NULL, NULL, 0);
btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
/* Update cleanup-related information in the metapage */
_bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
info->num_heap_tuples);
}
/*
@ -862,7 +952,7 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
BTCycleId cycleid)
BTCycleId cycleid, TransactionId *oldestBtpoXact)
{
Relation rel = info->index;
BTVacState vstate;
@ -887,6 +977,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastBlockLocked = BTREE_METAPAGE;
vstate.totFreePages = 0;
vstate.oldestBtpoXact = InvalidTransactionId;
/* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
@ -991,6 +1082,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
if (oldestBtpoXact)
*oldestBtpoXact = vstate.oldestBtpoXact;
}
/*
@ -1070,6 +1164,11 @@ restart:
{
/* Already deleted, but can't recycle yet */
stats->pages_deleted++;
/* Update the oldest btpo.xact */
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
}
else if (P_ISHALFDEAD(opaque))
{
@ -1238,7 +1337,12 @@ restart:
/* count only this page, else may double-count parent */
if (ndel)
{
stats->pages_deleted++;
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
}
MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */

View File

@ -108,6 +108,8 @@ _bt_restore_meta(XLogReaderState *record, uint8 block_id)
md->btm_level = xlrec->level;
md->btm_fastroot = xlrec->fastroot;
md->btm_fastlevel = xlrec->fastlevel;
md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
pageop->btpo_flags = BTP_META;
@ -985,7 +987,6 @@ btree_xlog_reuse_page(XLogReaderState *record)
}
}
void
btree_redo(XLogReaderState *record)
{
@ -1027,6 +1028,9 @@ btree_redo(XLogReaderState *record)
case XLOG_BTREE_REUSE_PAGE:
btree_xlog_reuse_page(record);
break;
case XLOG_BTREE_META_CLEANUP:
_bt_restore_meta(record, 0);
break;
default:
elog(PANIC, "btree_redo: unknown op code %u", info);
}

View File

@ -138,3 +138,5 @@ int VacuumPageDirty = 0;
int VacuumCostBalance = 0; /* working state for vacuum */
bool VacuumCostActive = false;
double vacuum_cleanup_index_scale_factor;

View File

@ -3208,6 +3208,16 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
{
{"vacuum_cleanup_index_scale_factor", PGC_SIGHUP, AUTOVACUUM,
gettext_noop("Number of tuple inserts prior to index cleanup as a fraction of reltuples."),
NULL
},
&vacuum_cleanup_index_scale_factor,
0.1, 0.0, 100.0,
NULL, NULL, NULL
},
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0.0, 0.0, 0.0, NULL, NULL, NULL

View File

@ -102,6 +102,11 @@ typedef struct BTMetaPageData
uint32 btm_level; /* tree level of the root page */
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
/* following fields are available since page version 3 */
TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among of
* deleted pages */
float4 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
* during last cleanup */
} BTMetaPageData;
#define BTPageGetMeta(p) \
@ -109,7 +114,8 @@ typedef struct BTMetaPageData
#define BTREE_METAPAGE 0 /* first page is meta */
#define BTREE_MAGIC 0x053162 /* magic number of btree pages */
#define BTREE_VERSION 2 /* current version number */
#define BTREE_VERSION 3 /* current version number */
#define BTREE_MIN_VERSION 2 /* minimal supported version number */
/*
* Maximum size of a btree index entry, including its tuple header.
@ -481,6 +487,9 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
* prototypes for functions in nbtpage.c
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern void _bt_update_meta_cleanup_info(Relation rel,
TransactionId oldestBtpoXact, float8 numHeapTuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int _bt_getrootheight(Relation rel);

View File

@ -38,6 +38,8 @@
* vacuum */
#define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from
* FSM */
#define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the
* metapage */
/*
* All that we need to regenerate the meta-data page
@ -48,6 +50,8 @@ typedef struct xl_btree_metadata
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
TransactionId oldest_btpo_xact;
double last_cleanup_num_heap_tuples;
} xl_btree_metadata;
/*

View File

@ -256,6 +256,8 @@ extern int VacuumPageDirty;
extern int VacuumCostBalance;
extern bool VacuumCostActive;
extern double vacuum_cleanup_index_scale_factor;
/* in tcop/postgres.c */

View File

@ -287,6 +287,8 @@ typedef struct StdRdOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
/* fraction of newly inserted tuples prior to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
int toast_tuple_target; /* target for tuple toasting */
AutoVacOpts autovacuum; /* autovacuum-related options */
bool user_catalog_table; /* use as an additional catalog relation */

View File

@ -150,3 +150,32 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g;
--
-- Test vacuum_cleanup_index_scale_factor
--
-- Simple create
create table btree_test(a int);
create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
reloptions
------------------------------------------
{vacuum_cleanup_index_scale_factor=40.0}
(1 row)
-- Fail while setting improper values
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
ERROR: value -10.0 out of bounds for option "vacuum_cleanup_index_scale_factor"
DETAIL: Valid values are between "0.000000" and "100.000000".
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": string
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
ERROR: invalid value for floating point option "vacuum_cleanup_index_scale_factor": true
-- Simple ALTER INDEX
alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
reloptions
------------------------------------------
{vacuum_cleanup_index_scale_factor=70.0}
(1 row)

View File

@ -92,3 +92,22 @@ vacuum btree_tall_tbl;
-- need to insert some rows to cause the fast root page to split.
insert into btree_tall_tbl (id, t)
select g, repeat('x', 100) from generate_series(1, 500) g;
--
-- Test vacuum_cleanup_index_scale_factor
--
-- Simple create
create table btree_test(a int);
create index btree_idx1 on btree_test(a) with (vacuum_cleanup_index_scale_factor = 40.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;
-- Fail while setting improper values
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = -10.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 100.0);
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = 'string');
create index btree_idx_err on btree_test(a) with (vacuum_cleanup_index_scale_factor = true);
-- Simple ALTER INDEX
alter index btree_idx1 set (vacuum_cleanup_index_scale_factor = 70.0);
select reloptions from pg_class WHERE oid = 'btree_idx1'::regclass;