mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-24 18:55:04 +08:00
Add missing and dangling downlink checks to amcheck
When bt_index_parent_check() is called with the heapallindexed option,
allocate a second Bloom filter to fingerprint block numbers that appear
in the downlinks of internal pages. Use Bloom filter probes when
walking the B-Tree to detect missing downlinks. This can detect subtle
problems with page deletion/VACUUM, such as corruption caused by the bug
just fixed in commit 6db4b499
.
The downlink Bloom filter is bound in size by work_mem. Its optimal
size is typically far smaller than that of the regular heapallindexed
Bloom filter, especially when the index has high fan-out.
Author: Peter Geoghegan
Reviewer: Teodor Sigaev
Discussion: https://postgr.es/m/CAH2-WznUzY4fWTjm1tBB3JpVz8cCfz7k_qVp5BhuPyhivmWJFg@mail.gmail.com
This commit is contained in:
parent
7f58f666cd
commit
4eaf7eaccb
@ -91,6 +91,10 @@ typedef struct BtreeCheckState
|
||||
|
||||
/* Bloom filter fingerprints B-Tree index */
|
||||
bloom_filter *filter;
|
||||
/* Bloom filter fingerprints downlink blocks within tree */
|
||||
bloom_filter *downlinkfilter;
|
||||
/* Right half of incomplete split marker */
|
||||
bool rightsplit;
|
||||
/* Debug counter */
|
||||
int64 heaptuplespresent;
|
||||
} BtreeCheckState;
|
||||
@ -124,6 +128,7 @@ static void bt_target_page_check(BtreeCheckState *state);
|
||||
static ScanKey bt_right_page_check_scankey(BtreeCheckState *state);
|
||||
static void bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
||||
ScanKey targetkey);
|
||||
static void bt_downlink_missing_check(BtreeCheckState *state);
|
||||
static void bt_tuple_present_callback(Relation index, HeapTuple htup,
|
||||
Datum *values, bool *isnull,
|
||||
bool tupleIsAlive, void *checkstate);
|
||||
@ -360,6 +365,9 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
||||
* before index fingerprinting begins, so we can later be certain that
|
||||
* index fingerprinting should have reached all tuples returned by
|
||||
* IndexBuildHeapScan().
|
||||
*
|
||||
* In readonly case, we also check for problems with missing downlinks.
|
||||
* A second Bloom filter is used for this.
|
||||
*/
|
||||
if (!state->readonly)
|
||||
{
|
||||
@ -386,6 +394,23 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
||||
errmsg("index \"%s\" cannot be verified using transaction snapshot",
|
||||
RelationGetRelationName(rel))));
|
||||
}
|
||||
else
|
||||
{
|
||||
int64 total_pages;
|
||||
|
||||
/*
|
||||
* Extra readonly downlink check.
|
||||
*
|
||||
* In readonly case, we know that there cannot be a concurrent page
|
||||
* split or a concurrent page deletion, which gives us the
|
||||
* opportunity to verify that every non-ignorable page had a
|
||||
* downlink one level up. We must be tolerant of interrupted page
|
||||
* splits and page deletions, though. This is taken care of in
|
||||
* bt_downlink_missing_check().
|
||||
*/
|
||||
total_pages = (int64) state->rel->rd_rel->relpages;
|
||||
state->downlinkfilter = bloom_create(total_pages, work_mem, seed);
|
||||
}
|
||||
}
|
||||
|
||||
/* Create context for page */
|
||||
@ -426,6 +451,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
||||
current.istruerootlevel = true;
|
||||
while (current.leftmost != P_NONE)
|
||||
{
|
||||
/*
|
||||
* Leftmost page on level cannot be right half of incomplete split.
|
||||
* This can go stale immediately in !readonly case.
|
||||
*/
|
||||
state->rightsplit = false;
|
||||
|
||||
/*
|
||||
* Verify this level, and get left most page for next level down, if
|
||||
* not at leaf level
|
||||
@ -449,6 +480,16 @@ bt_check_every_level(Relation rel, Relation heaprel, bool readonly,
|
||||
IndexInfo *indexinfo = BuildIndexInfo(state->rel);
|
||||
HeapScanDesc scan;
|
||||
|
||||
/* Report on extra downlink checks performed in readonly case */
|
||||
if (state->readonly)
|
||||
{
|
||||
ereport(DEBUG1,
|
||||
(errmsg_internal("finished verifying presence of downlink blocks within index \"%s\" with bitset %.2f%% set",
|
||||
RelationGetRelationName(rel),
|
||||
100.0 * bloom_prop_bits_set(state->downlinkfilter))));
|
||||
bloom_free(state->downlinkfilter);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create our own scan for IndexBuildHeapScan(), rather than getting it
|
||||
* to do so for us. This is required so that we can actually use the
|
||||
@ -564,6 +605,25 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
||||
|
||||
if (P_IGNORE(opaque))
|
||||
{
|
||||
/*
|
||||
* Since there cannot be a concurrent VACUUM operation in readonly
|
||||
* mode, and since a page has no links within other pages (siblings
|
||||
* and parent) once it is marked fully deleted, it should be
|
||||
* impossible to land on a fully deleted page in readonly mode.
|
||||
* See bt_downlink_check() for further details.
|
||||
*
|
||||
* The bt_downlink_check() P_ISDELETED() check is repeated here so
|
||||
* that pages that are only reachable through sibling links get
|
||||
* checked.
|
||||
*/
|
||||
if (state->readonly && P_ISDELETED(opaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("downlink or sibling link points to deleted block in index \"%s\"",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Block=%u left block=%u left link from block=%u.",
|
||||
current, leftcurrent, opaque->btpo_prev)));
|
||||
|
||||
if (P_RIGHTMOST(opaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
@ -617,7 +677,7 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
||||
/* Internal page -- downlink gets leftmost on next level */
|
||||
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(opaque));
|
||||
itup = (IndexTuple) PageGetItem(state->target, itemid);
|
||||
nextleveldown.leftmost = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
|
||||
nextleveldown.leftmost = BTreeInnerTupleGetDownLink(itup);
|
||||
nextleveldown.level = opaque->btpo.level - 1;
|
||||
}
|
||||
else
|
||||
@ -639,6 +699,10 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* readonly mode can only ever land on live pages and half-dead pages,
|
||||
* so sibling pointers should always be in mutual agreement
|
||||
*/
|
||||
if (state->readonly && opaque->btpo_prev != leftcurrent)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
@ -668,6 +732,13 @@ nextpage:
|
||||
errmsg("circular link chain found in block %u of index \"%s\"",
|
||||
current, RelationGetRelationName(state->rel))));
|
||||
|
||||
/*
|
||||
* Record if page that is about to become target is the right half of
|
||||
* an incomplete page split. This can go stale immediately in
|
||||
* !readonly case.
|
||||
*/
|
||||
state->rightsplit = P_INCOMPLETE_SPLIT(opaque);
|
||||
|
||||
leftcurrent = current;
|
||||
current = opaque->btpo_next;
|
||||
|
||||
@ -812,6 +883,16 @@ bt_target_page_check(BtreeCheckState *state)
|
||||
(uint32) state->targetlsn)));
|
||||
}
|
||||
|
||||
/* Fingerprint downlink blocks in heapallindexed + readonly case */
|
||||
if (state->heapallindexed && state->readonly && !P_ISLEAF(topaque))
|
||||
{
|
||||
BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
|
||||
|
||||
bloom_add_element(state->downlinkfilter,
|
||||
(unsigned char *) &childblock,
|
||||
sizeof(BlockNumber));
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't try to generate scankey using "negative infinity" item on
|
||||
* internal pages. They are always truncated to zero attributes.
|
||||
@ -984,11 +1065,19 @@ bt_target_page_check(BtreeCheckState *state)
|
||||
*/
|
||||
if (!P_ISLEAF(topaque) && state->readonly)
|
||||
{
|
||||
BlockNumber childblock = ItemPointerGetBlockNumberNoCheck(&(itup->t_tid));
|
||||
BlockNumber childblock = BTreeInnerTupleGetDownLink(itup);
|
||||
|
||||
bt_downlink_check(state, childblock, skey);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* * Check if page has a downlink in parent *
|
||||
*
|
||||
* This can only be checked in readonly + heapallindexed case.
|
||||
*/
|
||||
if (state->heapallindexed && state->readonly)
|
||||
bt_downlink_missing_check(state);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1272,6 +1361,40 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
||||
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
|
||||
maxoffset = PageGetMaxOffsetNumber(child);
|
||||
|
||||
/*
|
||||
* Since there cannot be a concurrent VACUUM operation in readonly mode,
|
||||
* and since a page has no links within other pages (siblings and parent)
|
||||
* once it is marked fully deleted, it should be impossible to land on a
|
||||
* fully deleted page.
|
||||
*
|
||||
* It does not quite make sense to enforce that the page cannot even be
|
||||
* half-dead, despite the fact the downlink is modified at the same stage
|
||||
* that the child leaf page is marked half-dead. That's incorrect because
|
||||
* there may occasionally be multiple downlinks from a chain of pages
|
||||
* undergoing deletion, where multiple successive calls are made to
|
||||
* _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
|
||||
* the leaf page as fully dead. While _bt_mark_page_halfdead() usually
|
||||
* removes the downlink to the leaf page that is marked half-dead, that's
|
||||
* not guaranteed, so it's possible we'll land on a half-dead page with a
|
||||
* downlink due to an interrupted multi-level page deletion.
|
||||
*
|
||||
* We go ahead with our checks if the child page is half-dead. It's safe
|
||||
* to do so because we do not test the child's high key, so it does not
|
||||
* matter that the original high key will have been replaced by a dummy
|
||||
* truncated high key within _bt_mark_page_halfdead(). All other page
|
||||
* items are left intact on a half-dead page, so there is still something
|
||||
* to test.
|
||||
*/
|
||||
if (P_ISDELETED(copaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("downlink to deleted page found in index \"%s\"",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%X.",
|
||||
state->targetblock, childblock,
|
||||
(uint32) (state->targetlsn >> 32),
|
||||
(uint32) state->targetlsn)));
|
||||
|
||||
for (offset = P_FIRSTDATAKEY(copaque);
|
||||
offset <= maxoffset;
|
||||
offset = OffsetNumberNext(offset))
|
||||
@ -1300,6 +1423,191 @@ bt_downlink_check(BtreeCheckState *state, BlockNumber childblock,
|
||||
pfree(child);
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks if page is missing a downlink that it should have.
|
||||
*
|
||||
* A page that lacks a downlink/parent may indicate corruption. However, we
|
||||
* must account for the fact that a missing downlink can occasionally be
|
||||
* encountered in a non-corrupt index. This can be due to an interrupted page
|
||||
* split, or an interrupted multi-level page deletion (i.e. there was a hard
|
||||
* crash or an error during a page split, or while VACUUM was deleting a
|
||||
* multi-level chain of pages).
|
||||
*
|
||||
* Note that this can only be called in readonly mode, so there is no need to
|
||||
* be concerned about concurrent page splits or page deletions.
|
||||
*/
|
||||
static void
|
||||
bt_downlink_missing_check(BtreeCheckState *state)
|
||||
{
|
||||
BTPageOpaque topaque = (BTPageOpaque) PageGetSpecialPointer(state->target);
|
||||
ItemId itemid;
|
||||
IndexTuple itup;
|
||||
Page child;
|
||||
BTPageOpaque copaque;
|
||||
uint32 level;
|
||||
BlockNumber childblk;
|
||||
|
||||
Assert(state->heapallindexed && state->readonly);
|
||||
Assert(!P_IGNORE(topaque));
|
||||
|
||||
/* No next level up with downlinks to fingerprint from the true root */
|
||||
if (P_ISROOT(topaque))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Incomplete (interrupted) page splits can account for the lack of a
|
||||
* downlink. Some inserting transaction should eventually complete the
|
||||
* page split in passing, when it notices that the left sibling page is
|
||||
* P_INCOMPLETE_SPLIT().
|
||||
*
|
||||
* In general, VACUUM is not prepared for there to be no downlink to a page
|
||||
* that it deletes. This is the main reason why the lack of a downlink can
|
||||
* be reported as corruption here. It's not obvious that an invalid
|
||||
* missing downlink can result in wrong answers to queries, though, since
|
||||
* index scans that land on the child may end up consistently moving right.
|
||||
* The handling of concurrent page splits (and page deletions) within
|
||||
* _bt_moveright() cannot distinguish inconsistencies that last for a
|
||||
* moment from inconsistencies that are permanent and irrecoverable.
|
||||
*
|
||||
* VACUUM isn't even prepared to delete pages that have no downlink due to
|
||||
* an incomplete page split, but it can detect and reason about that case
|
||||
* by design, so it shouldn't be taken to indicate corruption. See
|
||||
* _bt_pagedel() for full details.
|
||||
*/
|
||||
if (state->rightsplit)
|
||||
{
|
||||
ereport(DEBUG1,
|
||||
(errcode(ERRCODE_NO_DATA),
|
||||
errmsg("harmless interrupted page split detected in index %s",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
|
||||
state->targetblock, topaque->btpo.level,
|
||||
topaque->btpo_prev,
|
||||
(uint32) (state->targetlsn >> 32),
|
||||
(uint32) state->targetlsn)));
|
||||
return;
|
||||
}
|
||||
|
||||
/* Target's downlink is typically present in parent/fingerprinted */
|
||||
if (!bloom_lacks_element(state->downlinkfilter,
|
||||
(unsigned char *) &state->targetblock,
|
||||
sizeof(BlockNumber)))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Target is probably the "top parent" of a multi-level page deletion.
|
||||
* We'll need to descend the subtree to make sure that descendant pages are
|
||||
* consistent with that, though.
|
||||
*
|
||||
* If the target page (which must be non-ignorable) is a leaf page, then
|
||||
* clearly it can't be the top parent. The lack of a downlink is probably
|
||||
* a symptom of a broad problem that could just as easily cause
|
||||
* inconsistencies anywhere else.
|
||||
*/
|
||||
if (P_ISLEAF(topaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("leaf index block lacks downlink in index \"%s\"",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Block=%u page lsn=%X/%X.",
|
||||
state->targetblock,
|
||||
(uint32) (state->targetlsn >> 32),
|
||||
(uint32) state->targetlsn)));
|
||||
|
||||
/* Descend from the target page, which is an internal page */
|
||||
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
|
||||
RelationGetRelationName(state->rel));
|
||||
|
||||
level = topaque->btpo.level;
|
||||
itemid = PageGetItemId(state->target, P_FIRSTDATAKEY(topaque));
|
||||
itup = (IndexTuple) PageGetItem(state->target, itemid);
|
||||
childblk = BTreeInnerTupleGetDownLink(itup);
|
||||
for (;;)
|
||||
{
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
child = palloc_btree_page(state, childblk);
|
||||
copaque = (BTPageOpaque) PageGetSpecialPointer(child);
|
||||
|
||||
if (P_ISLEAF(copaque))
|
||||
break;
|
||||
|
||||
/* Do an extra sanity check in passing on internal pages */
|
||||
if (copaque->btpo.level != level - 1)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Top parent/target block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
|
||||
state->targetblock, childblk,
|
||||
level - 1, copaque->btpo.level)));
|
||||
|
||||
level = copaque->btpo.level;
|
||||
itemid = PageGetItemId(child, P_FIRSTDATAKEY(copaque));
|
||||
itup = (IndexTuple) PageGetItem(child, itemid);
|
||||
childblk = BTreeInnerTupleGetDownLink(itup);
|
||||
/* Be slightly more pro-active in freeing this memory, just in case */
|
||||
pfree(child);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since there cannot be a concurrent VACUUM operation in readonly mode,
|
||||
* and since a page has no links within other pages (siblings and parent)
|
||||
* once it is marked fully deleted, it should be impossible to land on a
|
||||
* fully deleted page. See bt_downlink_check() for further details.
|
||||
*
|
||||
* The bt_downlink_check() P_ISDELETED() check is repeated here because
|
||||
* bt_downlink_check() does not visit pages reachable through negative
|
||||
* infinity items. Besides, bt_downlink_check() is unwilling to descend
|
||||
* multiple levels. (The similar bt_downlink_check() P_ISDELETED() check
|
||||
* within bt_check_level_from_leftmost() won't reach the page either, since
|
||||
* the leaf's live siblings should have their sibling links updating to
|
||||
* bypass the deletion target page when it is marked fully dead.)
|
||||
*
|
||||
* If this error is raised, it might be due to a previous multi-level page
|
||||
* deletion that failed to realize that it wasn't yet safe to mark the leaf
|
||||
* page as fully dead. A "dangling downlink" will still remain when this
|
||||
* happens. The fact that the dangling downlink's page (the leaf's
|
||||
* parent/ancestor page) lacked a downlink is incidental.
|
||||
*/
|
||||
if (P_ISDELETED(copaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Top parent/target block=%u leaf block=%u top parent/target lsn=%X/%X.",
|
||||
state->targetblock, childblk,
|
||||
(uint32) (state->targetlsn >> 32),
|
||||
(uint32) state->targetlsn)));
|
||||
|
||||
/*
|
||||
* Iff leaf page is half-dead, its high key top parent link should point to
|
||||
* what VACUUM considered to be the top parent page at the instant it was
|
||||
* interrupted. Provided the high key link actually points to the target
|
||||
* page, the missing downlink we detected is consistent with there having
|
||||
* been an interrupted multi-level page deletion. This means that the
|
||||
* subtree with the target page at its root (a page deletion chain) is in a
|
||||
* consistent state, enabling VACUUM to resume deleting the entire chain
|
||||
* the next time it encounters the half-dead leaf page.
|
||||
*/
|
||||
if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
|
||||
{
|
||||
itemid = PageGetItemId(child, P_HIKEY);
|
||||
itup = (IndexTuple) PageGetItem(child, itemid);
|
||||
if (BTreeTupleGetTopParent(itup) == state->targetblock)
|
||||
return;
|
||||
}
|
||||
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("internal index block lacks downlink in index \"%s\"",
|
||||
RelationGetRelationName(state->rel)),
|
||||
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
|
||||
state->targetblock, topaque->btpo.level,
|
||||
(uint32) (state->targetlsn >> 32),
|
||||
(uint32) state->targetlsn)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Per-tuple callback from IndexBuildHeapScan, used to determine if index has
|
||||
* all the entries that definitely should have been observed in leaf pages of
|
||||
@ -1376,13 +1684,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
|
||||
* Note that we rely on deterministic index_form_tuple() TOAST compression.
|
||||
* If index_form_tuple() was ever enhanced to compress datums out-of-line,
|
||||
* or otherwise varied when or how compression was applied, our assumption
|
||||
* would break, leading to false positive reports of corruption. For now,
|
||||
* we don't decompress/normalize toasted values as part of fingerprinting.
|
||||
*
|
||||
* In future, non-pivot index tuples might get use of
|
||||
* BT_N_KEYS_OFFSET_MASK. Then binary representation of index tuple linked
|
||||
* to particular heap tuple might vary and meeds to be normalized before
|
||||
* bloom filter lookup.
|
||||
* would break, leading to false positive reports of corruption. It's also
|
||||
* possible that non-pivot tuples could in the future have alternative
|
||||
* equivalent representations (e.g. by using the INDEX_ALT_TID_MASK bit).
|
||||
* For now, we don't decompress/normalize toasted values as part of
|
||||
* fingerprinting.
|
||||
*/
|
||||
itup = index_form_tuple(RelationGetDescr(index), values, isnull);
|
||||
itup->t_tid = htup->t_self;
|
||||
@ -1393,8 +1699,8 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATA_CORRUPTED),
|
||||
errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
|
||||
ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
|
||||
ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)),
|
||||
ItemPointerGetBlockNumber(&(itup->t_tid)),
|
||||
ItemPointerGetOffsetNumber(&(itup->t_tid)),
|
||||
RelationGetRelationName(state->heaprel),
|
||||
RelationGetRelationName(state->rel)),
|
||||
!state->readonly
|
||||
@ -1520,6 +1826,7 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
||||
Buffer buffer;
|
||||
Page page;
|
||||
BTPageOpaque opaque;
|
||||
OffsetNumber maxoffset;
|
||||
|
||||
page = palloc(BLCKSZ);
|
||||
|
||||
@ -1566,9 +1873,13 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("version mismatch in index \"%s\": file version %d, "
|
||||
"current version %d, minimal supported version %d",
|
||||
"current version %d, minimum supported version %d",
|
||||
RelationGetRelationName(state->rel),
|
||||
metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
|
||||
metad->btm_version, BTREE_VERSION,
|
||||
BTREE_MIN_VERSION)));
|
||||
|
||||
/* Finished with metapage checks */
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1581,12 +1892,66 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
|
||||
errmsg("invalid leaf page level %u for block %u in index \"%s\"",
|
||||
opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
|
||||
|
||||
if (blocknum != BTREE_METAPAGE && !P_ISLEAF(opaque) &&
|
||||
!P_ISDELETED(opaque) && opaque->btpo.level == 0)
|
||||
if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
|
||||
opaque->btpo.level == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("invalid internal page level 0 for block %u in index \"%s\"",
|
||||
opaque->btpo.level, RelationGetRelationName(state->rel))));
|
||||
blocknum, RelationGetRelationName(state->rel))));
|
||||
|
||||
/*
|
||||
* Sanity checks for number of items on page.
|
||||
*
|
||||
* As noted at the beginning of _bt_binsrch(), an internal page must have
|
||||
* children, since there must always be a negative infinity downlink (there
|
||||
* may also be a highkey). In the case of non-rightmost leaf pages, there
|
||||
* must be at least a highkey.
|
||||
*
|
||||
* This is correct when pages are half-dead, since internal pages are never
|
||||
* half-dead, and leaf pages must have a high key when half-dead (the
|
||||
* rightmost page can never be deleted). It's also correct with fully
|
||||
* deleted pages: _bt_unlink_halfdead_page() doesn't change anything about
|
||||
* the target page other than setting the page as fully dead, and setting
|
||||
* its xact field. In particular, it doesn't change the sibling links in
|
||||
* the deletion target itself, since they're required when index scans land
|
||||
* on the deletion target, and then need to move right (or need to move
|
||||
* left, in the case of backward index scans).
|
||||
*/
|
||||
maxoffset = PageGetMaxOffsetNumber(page);
|
||||
if (maxoffset > MaxIndexTuplesPerPage)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
|
||||
blocknum, RelationGetRelationName(state->rel),
|
||||
MaxIndexTuplesPerPage)));
|
||||
|
||||
if (!P_ISLEAF(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
|
||||
blocknum, RelationGetRelationName(state->rel))));
|
||||
|
||||
if (P_ISLEAF(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
|
||||
blocknum, RelationGetRelationName(state->rel))));
|
||||
|
||||
/*
|
||||
* In general, internal pages are never marked half-dead, except on
|
||||
* versions of Postgres prior to 9.4, where it can be valid transient
|
||||
* state. This state is nonetheless treated as corruption by VACUUM on
|
||||
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
|
||||
* details.
|
||||
*
|
||||
* Internal pages should never have garbage items, either.
|
||||
*/
|
||||
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INDEX_CORRUPTED),
|
||||
errmsg("internal page block %u in index \"%s\" is half-dead",
|
||||
blocknum, RelationGetRelationName(state->rel)),
|
||||
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
|
||||
|
||||
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
|
||||
ereport(ERROR,
|
||||
|
@ -55,7 +55,7 @@
|
||||
<function>bt_index_check</function> tests that its target, a
|
||||
B-Tree index, respects a variety of invariants. Example usage:
|
||||
<screen>
|
||||
test=# SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique)
|
||||
test=# SELECT bt_index_check(index => c.oid, heapallindexed => i.indisunique),
|
||||
c.relname,
|
||||
c.relpages
|
||||
FROM pg_index i
|
||||
@ -67,7 +67,7 @@ WHERE am.amname = 'btree' AND n.nspname = 'pg_catalog'
|
||||
-- Don't check temp tables, which may be from another session:
|
||||
AND c.relpersistence != 't'
|
||||
-- Function may throw an error when this is omitted:
|
||||
AND i.indisready AND i.indisvalid
|
||||
AND c.relkind = 'i' AND i.indisready AND i.indisvalid
|
||||
ORDER BY c.relpages DESC LIMIT 10;
|
||||
bt_index_check | relname | relpages
|
||||
----------------+---------------------------------+----------
|
||||
@ -126,7 +126,8 @@ ORDER BY c.relpages DESC LIMIT 10;
|
||||
Optionally, when the <parameter>heapallindexed</parameter>
|
||||
argument is <literal>true</literal>, the function verifies the
|
||||
presence of all heap tuples that should be found within the
|
||||
index. The checks that can be performed by
|
||||
index, and that there are no missing downlinks in the index
|
||||
structure. The checks that can be performed by
|
||||
<function>bt_index_parent_check</function> are a superset of the
|
||||
checks that can be performed by <function>bt_index_check</function>.
|
||||
<function>bt_index_parent_check</function> can be thought of as
|
||||
|
Loading…
Reference in New Issue
Block a user