Allow amcheck to re-find tuples using new search.

Teach contrib/amcheck's bt_index_parent_check() function to take
advantage of the uniqueness property of heapkeyspace indexes in support
of a new verification option: non-pivot tuples (non-highkey tuples on
the leaf level) can optionally be re-found using a new search for each,
that starts from the root page.  If a tuple cannot be re-found, report
that the index is corrupt.

The new "rootdescend" verification option is exhaustive, and can
therefore make a call to bt_index_parent_check() take a lot longer.
Re-finding tuples during verification is mostly intended as an option
for backend developers, since the corruption scenarios that only it is
capable of detecting seem fairly far-fetched.

For example, "rootdescend" verification is much more likely to detect
corruption of the least significant byte of a key from a pivot tuple in
the root page of a B-Tree that already has at least three levels.
Typically, only a few tuples on a cousin leaf page are at risk of
"getting overlooked" by index scans in this scenario.  The corrupt key
in the root page is only slightly corrupt: corrupt enough to give wrong
answers to some queries, and yet not corrupt enough to allow the problem
to be detected without verifying agreement between the leaf page and the
root page, skipping at least one internal page level.  The existing
bt_index_parent_check() checks never cross more than a single level.

Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas
Discussion: https://postgr.es/m/CAH2-Wz=yTWnVu+HeHGKb2AGiADL9eprn-cKYAto4MkKOuiGtRQ@mail.gmail.com
This commit is contained in:
Peter Geoghegan 2019-03-20 10:41:36 -07:00
parent fab2502433
commit c1afd175b5
7 changed files with 160 additions and 16 deletions

View File

@ -4,7 +4,7 @@ MODULE_big = amcheck
OBJS = verify_nbtree.o $(WIN32RES) OBJS = verify_nbtree.o $(WIN32RES)
EXTENSION = amcheck EXTENSION = amcheck
DATA = amcheck--1.0--1.1.sql amcheck--1.0.sql DATA = amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck--1.0.sql
PGFILEDESC = "amcheck - function for verifying relation integrity" PGFILEDESC = "amcheck - function for verifying relation integrity"
REGRESS = check check_btree REGRESS = check check_btree

View File

@ -0,0 +1,19 @@
/* contrib/amcheck/amcheck--1.1--1.2.sql */
-- Bail out if this script is sourced directly in psql; it is only meant to
-- be run as part of an extension update
\echo Use "ALTER EXTENSION amcheck UPDATE TO '1.2'" to load this file. \quit
-- To sidestep dependency problems during the update to 1.2, the existing 1.1
-- function is not replaced; a new overloaded variant with one additional
-- argument is created alongside it
--
-- bt_index_parent_check(index, heapallindexed, rootdescend)
--
CREATE FUNCTION bt_index_parent_check(
    index regclass,
    heapallindexed boolean,
    rootdescend boolean
)
RETURNS VOID
AS 'MODULE_PATHNAME', 'bt_index_parent_check'
LANGUAGE C
STRICT
PARALLEL RESTRICTED;
-- The new overload should not be available to PUBLIC
REVOKE ALL
    ON FUNCTION bt_index_parent_check(regclass, boolean, boolean)
    FROM PUBLIC;

View File

@ -1,5 +1,5 @@
# amcheck extension # amcheck extension
comment = 'functions for verifying relation integrity' comment = 'functions for verifying relation integrity'
default_version = '1.1' default_version = '1.2'
module_pathname = '$libdir/amcheck' module_pathname = '$libdir/amcheck'
relocatable = true relocatable = true

View File

@ -126,7 +126,8 @@ SELECT bt_index_parent_check('bttest_multi_idx', true);
(1 row) (1 row)
-- --
-- Test for multilevel page deletion/downlink present checks -- Test for multilevel page deletion/downlink present checks, and rootdescend
-- checks
-- --
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
@ -137,7 +138,7 @@ VACUUM delete_test_table;
-- root" -- root"
DELETE FROM delete_test_table WHERE a < 79990; DELETE FROM delete_test_table WHERE a < 79990;
VACUUM delete_test_table; VACUUM delete_test_table;
SELECT bt_index_parent_check('delete_test_table_pkey', true); SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
bt_index_parent_check bt_index_parent_check
----------------------- -----------------------

View File

@ -78,7 +78,8 @@ INSERT INTO bttest_multi SELECT i, i%2 FROM generate_series(1, 100000) as i;
SELECT bt_index_parent_check('bttest_multi_idx', true); SELECT bt_index_parent_check('bttest_multi_idx', true);
-- --
-- Test for multilevel page deletion/downlink present checks -- Test for multilevel page deletion/downlink present checks, and rootdescend
-- checks
-- --
INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i; INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;
ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d); ALTER TABLE delete_test_table ADD PRIMARY KEY (a,b,c,d);
@ -89,7 +90,7 @@ VACUUM delete_test_table;
-- root" -- root"
DELETE FROM delete_test_table WHERE a < 79990; DELETE FROM delete_test_table WHERE a < 79990;
VACUUM delete_test_table; VACUUM delete_test_table;
SELECT bt_index_parent_check('delete_test_table_pkey', true); SELECT bt_index_parent_check('delete_test_table_pkey', true, true);
-- --
-- BUG #15597: must not assume consistent input toasting state when forming -- BUG #15597: must not assume consistent input toasting state when forming

View File

@ -75,6 +75,8 @@ typedef struct BtreeCheckState
bool readonly; bool readonly;
/* Also verifying heap has no unindexed tuples? */ /* Also verifying heap has no unindexed tuples? */
bool heapallindexed; bool heapallindexed;
/* Also making sure non-pivot tuples can be found by new search? */
bool rootdescend;
/* Per-page context */ /* Per-page context */
MemoryContext targetcontext; MemoryContext targetcontext;
/* Buffer access strategy */ /* Buffer access strategy */
@ -124,10 +126,11 @@ PG_FUNCTION_INFO_V1(bt_index_check);
PG_FUNCTION_INFO_V1(bt_index_parent_check); PG_FUNCTION_INFO_V1(bt_index_parent_check);
static void bt_index_check_internal(Oid indrelid, bool parentcheck, static void bt_index_check_internal(Oid indrelid, bool parentcheck,
bool heapallindexed); bool heapallindexed, bool rootdescend);
static inline void btree_index_checkable(Relation rel); static inline void btree_index_checkable(Relation rel);
static void bt_check_every_level(Relation rel, Relation heaprel, static void bt_check_every_level(Relation rel, Relation heaprel,
bool heapkeyspace, bool readonly, bool heapallindexed); bool heapkeyspace, bool readonly, bool heapallindexed,
bool rootdescend);
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state,
BtreeLevel level); BtreeLevel level);
static void bt_target_page_check(BtreeCheckState *state); static void bt_target_page_check(BtreeCheckState *state);
@ -140,6 +143,7 @@ static void bt_tuple_present_callback(Relation index, HeapTuple htup,
bool tupleIsAlive, void *checkstate); bool tupleIsAlive, void *checkstate);
static IndexTuple bt_normalize_tuple(BtreeCheckState *state, static IndexTuple bt_normalize_tuple(BtreeCheckState *state,
IndexTuple itup); IndexTuple itup);
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup);
static inline bool offset_is_negative_infinity(BTPageOpaque opaque, static inline bool offset_is_negative_infinity(BTPageOpaque opaque,
OffsetNumber offset); OffsetNumber offset);
static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, static inline bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key,
@ -177,7 +181,7 @@ bt_index_check(PG_FUNCTION_ARGS)
if (PG_NARGS() == 2) if (PG_NARGS() == 2)
heapallindexed = PG_GETARG_BOOL(1); heapallindexed = PG_GETARG_BOOL(1);
bt_index_check_internal(indrelid, false, heapallindexed); bt_index_check_internal(indrelid, false, heapallindexed, false);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
@ -196,11 +200,14 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
{ {
Oid indrelid = PG_GETARG_OID(0); Oid indrelid = PG_GETARG_OID(0);
bool heapallindexed = false; bool heapallindexed = false;
bool rootdescend = false;
if (PG_NARGS() == 2) if (PG_NARGS() >= 2)
heapallindexed = PG_GETARG_BOOL(1); heapallindexed = PG_GETARG_BOOL(1);
if (PG_NARGS() == 3)
rootdescend = PG_GETARG_BOOL(2);
bt_index_check_internal(indrelid, true, heapallindexed); bt_index_check_internal(indrelid, true, heapallindexed, rootdescend);
PG_RETURN_VOID(); PG_RETURN_VOID();
} }
@ -209,7 +216,8 @@ bt_index_parent_check(PG_FUNCTION_ARGS)
* Helper for bt_index_[parent_]check, coordinating the bulk of the work. * Helper for bt_index_[parent_]check, coordinating the bulk of the work.
*/ */
static void static void
bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed) bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed,
bool rootdescend)
{ {
Oid heapid; Oid heapid;
Relation indrel; Relation indrel;
@ -267,7 +275,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed)
/* Check index, possibly against table it is an index on */ /* Check index, possibly against table it is an index on */
heapkeyspace = _bt_heapkeyspace(indrel); heapkeyspace = _bt_heapkeyspace(indrel);
bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck,
heapallindexed); heapallindexed, rootdescend);
/* /*
* Release locks early. That's ok here because nothing in the called * Release locks early. That's ok here because nothing in the called
@ -338,7 +346,7 @@ btree_index_checkable(Relation rel)
*/ */
static void static void
bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
bool readonly, bool heapallindexed) bool readonly, bool heapallindexed, bool rootdescend)
{ {
BtreeCheckState *state; BtreeCheckState *state;
Page metapage; Page metapage;
@ -362,6 +370,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
state->heapkeyspace = heapkeyspace; state->heapkeyspace = heapkeyspace;
state->readonly = readonly; state->readonly = readonly;
state->heapallindexed = heapallindexed; state->heapallindexed = heapallindexed;
state->rootdescend = rootdescend;
if (state->heapallindexed) if (state->heapallindexed)
{ {
@ -430,6 +439,14 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
} }
} }
Assert(!state->rootdescend || state->readonly);
if (state->rootdescend && !state->heapkeyspace)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
RelationGetRelationName(rel)),
errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
/* Create context for page */ /* Create context for page */
state->targetcontext = AllocSetContextCreate(CurrentMemoryContext, state->targetcontext = AllocSetContextCreate(CurrentMemoryContext,
"amcheck context", "amcheck context",
@ -922,6 +939,31 @@ bt_target_page_check(BtreeCheckState *state)
if (offset_is_negative_infinity(topaque, offset)) if (offset_is_negative_infinity(topaque, offset))
continue; continue;
/*
* Readonly callers may optionally verify that non-pivot tuples can
* each be found by an independent search that starts from the root
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
char *itid,
*htid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
ItemPointerGetBlockNumber(&(itup->t_tid)),
ItemPointerGetOffsetNumber(&(itup->t_tid)));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("could not find tuple using search from root page in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
itid, htid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
/* Build insertion scankey for current page offset */ /* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup); skey = bt_mkscankey_pivotsearch(state->rel, itup);
@ -1526,6 +1568,9 @@ bt_downlink_check(BtreeCheckState *state, BTScanInsert targetkey,
* internal pages. In more general terms, a negative infinity item is * internal pages. In more general terms, a negative infinity item is
* only negative infinity with respect to the subtree that the page is * only negative infinity with respect to the subtree that the page is
* at the root of. * at the root of.
*
* See also: bt_rootdescend(), which can even detect transitive
* inconsistencies on cousin leaf pages.
*/ */
if (offset_is_negative_infinity(copaque, offset)) if (offset_is_negative_infinity(copaque, offset))
continue; continue;
@ -1926,6 +1971,81 @@ bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
return reformed; return reformed;
} }
/*
* Search for itup in index, starting from fast root page. itup must be a
* non-pivot tuple. This is only supported with heapkeyspace indexes, since
* we rely on having fully unique keys to find a match with only a single
* visit to a leaf page, barring an interrupted page split, where we may have
* to move right. (A concurrent page split is impossible because caller must
* be readonly caller.)
*
* This routine can detect very subtle transitive consistency issues across
* more than one level of the tree. Leaf pages all have a high key (even the
* rightmost page has a conceptual positive infinity high key), but not a low
* key. Their downlink in parent is a lower bound, which along with the high
* key is almost enough to detect every possible inconsistency. A downlink
* separator key value won't always be available from parent, though, because
* the first items of internal pages are negative infinity items, truncated
* down to zero attributes during internal page splits. While it's true that
* bt_downlink_check() and the high key check can detect most imaginable key
* space problems, there are remaining problems it won't detect with non-pivot
* tuples in cousin leaf pages. Starting a search from the root for every
* existing leaf tuple detects small inconsistencies in upper levels of the
* tree that cannot be detected any other way. (Besides all this, this is
* probably also useful as a direct test of the code used by index scans
* themselves.)
*
* Returns true when the leaf page reached by the search has an item at the
* position found by the binary search that compares as equal to the search
* key built from itup, and false otherwise (including when the search does
* not hand back a valid buffer).
*/
static bool
bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
{
BTScanInsert key;
BTStack stack;
Buffer lbuf;
bool exists;
/* Build the search key from the tuple that we want to re-find */
key = _bt_mkscankey(state->rel, itup);
Assert(key->heapkeyspace && key->scantid != NULL);
/*
* Search from root.
*
* Ideally, we would arrange to only move right within _bt_search() when
* an interrupted page split is detected (i.e. when the incomplete split
* bit is found to be set), but for now we accept the possibility that
* that could conceal an inconsistency.
*/
Assert(state->readonly && state->rootdescend);
/* Assume no match until the leaf page comparison below says otherwise */
exists = false;
stack = _bt_search(state->rel, key, &lbuf, BT_READ, NULL);
if (BufferIsValid(lbuf))
{
BTInsertStateData insertstate;
OffsetNumber offnum;
Page page;
/* Fill in the insertion state that is handed to _bt_binsrch_insert() */
insertstate.itup = itup;
insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
insertstate.itup_key = key;
insertstate.bounds_valid = false;
insertstate.buf = lbuf;
/* Get matching tuple on leaf page */
offnum = _bt_binsrch_insert(state->rel, &insertstate);
/* Compare first >= matching item on leaf page, if any */
page = BufferGetPage(lbuf);
if (offnum <= PageGetMaxOffsetNumber(page) &&
_bt_compare(state->rel, key, page, offnum) == 0)
exists = true;
/* Done with the leaf page */
_bt_relbuf(state->rel, lbuf);
}
/* Free the descent stack and the search key before returning */
_bt_freestack(stack);
pfree(key);
return exists;
}
/* /*
* Is particular offset within page (whose special state is passed by caller) * Is particular offset within page (whose special state is passed by caller)
* the page negative-infinity item? * the page negative-infinity item?

View File

@ -112,7 +112,7 @@ ORDER BY c.relpages DESC LIMIT 10;
<varlistentry> <varlistentry>
<term> <term>
<function>bt_index_parent_check(index regclass, heapallindexed boolean) returns void</function> <function>bt_index_parent_check(index regclass, heapallindexed boolean, rootdescend boolean) returns void</function>
<indexterm> <indexterm>
<primary>bt_index_parent_check</primary> <primary>bt_index_parent_check</primary>
</indexterm> </indexterm>
@ -126,7 +126,10 @@ ORDER BY c.relpages DESC LIMIT 10;
argument is <literal>true</literal>, the function verifies the argument is <literal>true</literal>, the function verifies the
presence of all heap tuples that should be found within the presence of all heap tuples that should be found within the
index, and that there are no missing downlinks in the index index, and that there are no missing downlinks in the index
structure. The checks that can be performed by structure. When the optional <parameter>rootdescend</parameter>
argument is <literal>true</literal>, verification re-finds
tuples on the leaf level by performing a new search from the
root page for each tuple. The checks that can be performed by
<function>bt_index_parent_check</function> are a superset of the <function>bt_index_parent_check</function> are a superset of the
checks that can be performed by <function>bt_index_check</function>. checks that can be performed by <function>bt_index_check</function>.
<function>bt_index_parent_check</function> can be thought of as <function>bt_index_parent_check</function> can be thought of as