From 93ee38eade1b2b4964354b95b01b09e17d6f098d Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 29 Feb 2020 12:10:17 -0800 Subject: [PATCH] Teach pageinspect about nbtree deduplication. Add a new bt_metap() column to display the metapage's allequalimage field. Also add three new columns to contrib/pageinspect's bt_page_items() function: * Add a boolean column ("dead") that displays the LP_DEAD bit value for each non-pivot tuple. * Add a TID column ("htid") that displays a single heap TID value for each tuple. This is the TID that is returned by BTreeTupleGetHeapTID(), so comparable values are shown for pivot tuples, plain non-pivot tuples, and posting list tuples. * Add a TID array column ("tids") that displays TIDs from each tuple's posting list, if any. This works just like the "tids" column from pageinspect's gin_leafpage_items() function. No version bump for the pageinspect extension, since there hasn't been a stable Postgres release since the last version bump (the last bump was part of commit 58b4cb30). Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-WzmSMmU2eNvY9+a4MNP+z02h6sa-uxZvN3un6jY02ZVBSw@mail.gmail.com --- contrib/pageinspect/btreefuncs.c | 146 +++++++++++++++--- contrib/pageinspect/expected/btree.out | 7 + contrib/pageinspect/pageinspect--1.7--1.8.sql | 53 +++++++ doc/src/sgml/pageinspect.sgml | 120 +++++++++----- 4 files changed, 273 insertions(+), 53 deletions(-) diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 564c818558..2ddedef714 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -31,9 +31,11 @@ #include "access/relation.h" #include "catalog/namespace.h" #include "catalog/pg_am.h" +#include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" #include "pageinspect.h" +#include "utils/array.h" #include "utils/builtins.h" #include "utils/rel.h" #include "utils/varlena.h" @@ -45,6 +47,8 @@ PG_FUNCTION_INFO_V1(bt_page_stats); #define IS_INDEX(r) ((r)->rd_rel->relkind == RELKIND_INDEX) #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) +#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X)) +#define ItemPointerGetDatum(X) PointerGetDatum(X) /* note: BlockNumber is unsigned, hence can't be negative */ #define CHECK_RELATION_BLOCK_RANGE(rel, blkno) { \ @@ -243,6 +247,9 @@ struct user_args { Page page; OffsetNumber offset; + bool leafpage; + bool rightmost; + TupleDesc tupd; }; /*------------------------------------------------------- @@ -252,17 +259,25 @@ struct user_args * ------------------------------------------------------ */ static Datum -bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) +bt_page_print_tuples(FuncCallContext *fctx, struct user_args *uargs) { - char *values[6]; + Page page = uargs->page; + OffsetNumber offset = uargs->offset; + bool leafpage = uargs->leafpage; + bool rightmost = uargs->rightmost; + bool ispivottuple; + Datum values[9]; + bool nulls[9]; HeapTuple tuple; ItemId id; IndexTuple itup; int j; int off; int dlen; - char *dump; + char *dump, + *datacstring; char *ptr; + ItemPointer htid; id = PageGetItemId(page, offset); @@ -272,18 +287,49 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) itup = (IndexTuple) PageGetItem(page, id); j = 0; - values[j++] = psprintf("%d", offset); - values[j++] = psprintf("(%u,%u)", - ItemPointerGetBlockNumberNoCheck(&itup->t_tid), - ItemPointerGetOffsetNumberNoCheck(&itup->t_tid)); - values[j++] = psprintf("%d", (int) IndexTupleSize(itup)); - values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f'); - values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f'); + memset(nulls, 0, sizeof(nulls)); + values[j++] = DatumGetInt16(offset); + values[j++] = ItemPointerGetDatum(&itup->t_tid); + values[j++] = Int32GetDatum((int) IndexTupleSize(itup)); + values[j++] = BoolGetDatum(IndexTupleHasNulls(itup)); + values[j++] = BoolGetDatum(IndexTupleHasVarwidths(itup)); ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info); dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info); + + /* + * Make sure that "data" column does not include posting list or pivot + * tuple representation of heap TID(s). + * + * Note: BTreeTupleIsPivot() won't work reliably on !heapkeyspace indexes + * (those built before BTREE_VERSION 4), but we have no way of determining + * if this page came from a !heapkeyspace index. We may only have a bytea + * nbtree page image to go on, so in general there is no metapage that we + * can check. + * + * That's okay here because BTreeTupleIsPivot() can only return false for + * a !heapkeyspace pivot, never true for a !heapkeyspace non-pivot. Since + * heap TID isn't part of the keyspace in a !heapkeyspace index anyway, + * there cannot possibly be a pivot tuple heap TID representation that we + * fail to make an adjustment for. A !heapkeyspace index can have + * BTreeTupleIsPivot() return true (due to things like suffix truncation + * for INCLUDE indexes in Postgres v11), but when that happens + * BTreeTupleGetHeapTID() can be trusted to work reliably (i.e. return + * NULL). + * + * Note: BTreeTupleIsPosting() always works reliably, even with + * !heapkeyspace indexes. + */ + if (BTreeTupleIsPosting(itup)) + dlen -= IndexTupleSize(itup) - BTreeTupleGetPostingOffset(itup); + else if (BTreeTupleIsPivot(itup) && BTreeTupleGetHeapTID(itup) != NULL) + dlen -= MAXALIGN(sizeof(ItemPointerData)); + + if (dlen < 0 || dlen > INDEX_SIZE_MASK) + elog(ERROR, "invalid tuple length %d for tuple at offset number %u", + dlen, offset); dump = palloc0(dlen * 3 + 1); - values[j] = dump; + datacstring = dump; for (off = 0; off < dlen; off++) { if (off > 0) @@ -291,8 +337,62 @@ bt_page_print_tuples(FuncCallContext *fctx, Page page, OffsetNumber offset) sprintf(dump, "%02x", *(ptr + off) & 0xff); dump += 2; } + values[j++] = CStringGetTextDatum(datacstring); + pfree(datacstring); - tuple = BuildTupleFromCStrings(fctx->attinmeta, values); + /* + * We need to work around the BTreeTupleIsPivot() !heapkeyspace limitation + * again. Deduce whether or not tuple must be a pivot tuple based on + * whether or not the page is a leaf page, as well as the page offset + * number of the tuple. + */ + ispivottuple = (!leafpage || (!rightmost && offset == P_HIKEY)); + + /* LP_DEAD bit can never be set for pivot tuples, so show a NULL there */ + if (!ispivottuple) + values[j++] = BoolGetDatum(ItemIdIsDead(id)); + else + { + Assert(!ItemIdIsDead(id)); + nulls[j++] = true; + } + + htid = BTreeTupleGetHeapTID(itup); + if (ispivottuple && !BTreeTupleIsPivot(itup)) + { + /* Don't show bogus heap TID in !heapkeyspace pivot tuple */ + htid = NULL; + } + + if (htid) + values[j++] = ItemPointerGetDatum(htid); + else + nulls[j++] = true; + + if (BTreeTupleIsPosting(itup)) + { + /* Build an array of item pointers */ + ItemPointer tids; + Datum *tids_datum; + int nposting; + + tids = BTreeTupleGetPosting(itup); + nposting = BTreeTupleGetNPosting(itup); + tids_datum = (Datum *) palloc(nposting * sizeof(Datum)); + for (int i = 0; i < nposting; i++) + tids_datum[i] = ItemPointerGetDatum(&tids[i]); + values[j++] = PointerGetDatum(construct_array(tids_datum, + nposting, + TIDOID, + sizeof(ItemPointerData), + false, 's')); + pfree(tids_datum); + } + else + nulls[j++] = true; + + /* Build and return the result tuple */ + tuple = heap_form_tuple(uargs->tupd, values, nulls); return HeapTupleGetDatum(tuple); } @@ -378,12 +478,15 @@ bt_page_items(PG_FUNCTION_ARGS) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + uargs->leafpage = P_ISLEAF(opaque); + uargs->rightmost = P_RIGHTMOST(opaque); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); + tupleDesc = BlessTupleDesc(tupleDesc); - fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); + uargs->tupd = tupleDesc; fctx->user_fctx = uargs; @@ -395,7 +498,7 @@ bt_page_items(PG_FUNCTION_ARGS) if (fctx->call_cntr < fctx->max_calls) { - result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); + result = bt_page_print_tuples(fctx, uargs); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } @@ -463,12 +566,15 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); + uargs->leafpage = P_ISLEAF(opaque); + uargs->rightmost = P_RIGHTMOST(opaque); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); + tupleDesc = BlessTupleDesc(tupleDesc); - fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); + uargs->tupd = tupleDesc; fctx->user_fctx = uargs; @@ -480,7 +586,7 @@ bt_page_items_bytea(PG_FUNCTION_ARGS) if (fctx->call_cntr < fctx->max_calls) { - result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); + result = bt_page_print_tuples(fctx, uargs); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } @@ -510,7 +616,7 @@ bt_metap(PG_FUNCTION_ARGS) BTMetaPageData *metad; TupleDesc tupleDesc; int j; - char *values[8]; + char *values[9]; Buffer buffer; Page page; HeapTuple tuple; @@ -557,17 +663,21 @@ bt_metap(PG_FUNCTION_ARGS) /* * Get values of extended metadata if available, use default values - * otherwise. + * otherwise. Note that we rely on the assumption that btm_allequalimage + * is initialized to zero with indexes that were built on versions prior + * to Postgres 13 (just like _bt_metaversion()). */ if (metad->btm_version >= BTREE_NOVAC_VERSION) { values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact); values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples); + values[j++] = metad->btm_allequalimage ? "t" : "f"; } else { values[j++] = "0"; values[j++] = "-1"; + values[j++] = "f"; } tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc), diff --git a/contrib/pageinspect/expected/btree.out b/contrib/pageinspect/expected/btree.out index 07c2dcd771..17bf0c5470 100644 --- a/contrib/pageinspect/expected/btree.out +++ b/contrib/pageinspect/expected/btree.out @@ -12,6 +12,7 @@ fastroot | 1 fastlevel | 0 oldest_xact | 0 last_cleanup_num_tuples | -1 +allequalimage | t SELECT * FROM bt_page_stats('test1_a_idx', 0); ERROR: block 0 is a meta page @@ -41,6 +42,9 @@ itemlen | 16 nulls | f vars | f data | 01 00 00 00 00 00 00 01 +dead | f +htid | (0,1) +tids | SELECT * FROM bt_page_items('test1_a_idx', 2); ERROR: block number out of range @@ -54,6 +58,9 @@ itemlen | 16 nulls | f vars | f data | 01 00 00 00 00 00 00 01 +dead | f +htid | (0,1) +tids | SELECT * FROM bt_page_items(get_raw_page('test1_a_idx', 2)); ERROR: block number 2 is out of range for relation "test1_a_idx" diff --git a/contrib/pageinspect/pageinspect--1.7--1.8.sql b/contrib/pageinspect/pageinspect--1.7--1.8.sql index 2a7c4b3516..e34c214c93 100644 --- a/contrib/pageinspect/pageinspect--1.7--1.8.sql +++ b/contrib/pageinspect/pageinspect--1.7--1.8.sql @@ -14,3 +14,56 @@ CREATE FUNCTION heap_tuple_infomask_flags( RETURNS record AS 'MODULE_PATHNAME', 'heap_tuple_infomask_flags' LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_metap() +-- +DROP FUNCTION bt_metap(text); +CREATE FUNCTION bt_metap(IN relname text, + OUT magic int4, + OUT version int4, + OUT root int4, + OUT level int4, + OUT fastroot int4, + OUT fastlevel int4, + OUT oldest_xact int4, + OUT last_cleanup_num_tuples real, + OUT allequalimage boolean) +AS 'MODULE_PATHNAME', 'bt_metap' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_items(text, int4) +-- +DROP FUNCTION bt_page_items(text, int4); +CREATE FUNCTION bt_page_items(IN relname text, IN blkno int4, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT nulls bool, + OUT vars bool, + OUT data text, + OUT dead boolean, + OUT htid tid, + OUT tids tid[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'bt_page_items' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- bt_page_items(bytea) +-- +DROP FUNCTION bt_page_items(bytea); +CREATE FUNCTION bt_page_items(IN page bytea, + OUT itemoffset smallint, + OUT ctid tid, + OUT itemlen smallint, + OUT nulls bool, + OUT vars bool, + OUT data text, + OUT dead boolean, + OUT htid tid, + OUT tids tid[]) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'bt_page_items_bytea' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/doc/src/sgml/pageinspect.sgml b/doc/src/sgml/pageinspect.sgml index 7e2e1487d7..fedc94e8b2 100644 --- a/doc/src/sgml/pageinspect.sgml +++ b/doc/src/sgml/pageinspect.sgml @@ -300,13 +300,14 @@ test=# SELECT t_ctid, raw_flags, combined_flags test=# SELECT * FROM bt_metap('pg_cast_oid_index'); -[ RECORD 1 ]-----------+------- magic | 340322 -version | 3 +version | 4 root | 1 level | 0 fastroot | 1 fastlevel | 0 oldest_xact | 582 last_cleanup_num_tuples | 1000 +allequalimage | f @@ -329,11 +330,11 @@ test=# SELECT * FROM bt_page_stats('pg_cast_oid_index', 1); -[ RECORD 1 ]-+----- blkno | 1 type | l -live_items | 256 +live_items | 224 dead_items | 0 -avg_item_size | 12 +avg_item_size | 16 page_size | 8192 -free_size | 4056 +free_size | 3668 btpo_prev | 0 btpo_next | 0 btpo | 0 @@ -356,33 +357,75 @@ btpo_flags | 3 bt_page_items returns detailed information about all of the items on a B-tree index page. For example: -test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); - itemoffset | ctid | itemlen | nulls | vars | data -------------+---------+---------+-------+------+------------- - 1 | (0,1) | 12 | f | f | 23 27 00 00 - 2 | (0,2) | 12 | f | f | 24 27 00 00 - 3 | (0,3) | 12 | f | f | 25 27 00 00 - 4 | (0,4) | 12 | f | f | 26 27 00 00 - 5 | (0,5) | 12 | f | f | 27 27 00 00 - 6 | (0,6) | 12 | f | f | 28 27 00 00 - 7 | (0,7) | 12 | f | f | 29 27 00 00 - 8 | (0,8) | 12 | f | f | 2a 27 00 00 + test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids + FROM bt_page_items(get_raw_page('tenk2_hundred', 5)); + itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids +------------+-----------+---------+-------+------+-------------------------+------+--------+--------------------- + 1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | | + 2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"} + 3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"} + 4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"} + 5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"} + 6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"} + 7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"} + 8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"} + 9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"} + 10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"} + 11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"} + 12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"} + 13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"} +(13 rows) - In a B-tree leaf page, ctid points to a heap tuple. - In an internal page, the block number part of ctid - points to another page in the index itself, while the offset part - (the second number) is ignored and is usually 1. + This is a B-tree leaf page. All tuples that point to the table + happen to be posting list tuples (all of which store a total of + 100 6 byte TIDs). There is also a high key tuple + at itemoffset number 1. + ctid is used to store encoded + information about each tuple in this example, though leaf page + tuples often store a heap TID directly in the + ctid field instead. + tids is the list of TIDs stored as a + posting list. + + + In an internal page (not shown), the block number part of + ctid is a downlink, + which is a block number of another page in the index itself. + The offset part (the second number) of + ctid stores encoded information about + the tuple, such as the number of columns present (suffix + truncation may have removed unneeded suffix columns). Truncated + columns are treated as having the value minus + infinity. + + + htid shows a heap TID for the tuple, + regardless of the underlying tuple representation. This value + may match ctid, or may be decoded + from the alternative representations used by posting list tuples + and tuples from internal pages. Tuples in internal pages + usually have the implementation level heap TID column truncated + away, which is represented as a NULL + htid value. Note that the first item on any non-rightmost page (any page with a non-zero value in the btpo_next field) is the page's high key, meaning its data serves as an upper bound on all items appearing on the page, while - its ctid field is meaningless. Also, on non-leaf - pages, the first real data item (the first item that is not a high - key) is a minus infinity item, with no actual value - in its data field. Such an item does have a valid - downlink in its ctid field, however. + its ctid field does not point to + another block. Also, on internal pages, the first real data + item (the first item that is not a high key) reliably has every + column truncated away, leaving no actual value in its + data field. Such an item does have a + valid downlink in its ctid field, + however. + + + For more details about the structure of B-tree indexes, see + . For more details about + deduplication and posting lists, see . @@ -402,17 +445,24 @@ test=# SELECT * FROM bt_page_items('pg_cast_oid_index', 1); with get_raw_page should be passed as argument. So the last example could also be rewritten like this: -test=# SELECT * FROM bt_page_items(get_raw_page('pg_cast_oid_index', 1)); - itemoffset | ctid | itemlen | nulls | vars | data -------------+---------+---------+-------+------+------------- - 1 | (0,1) | 12 | f | f | 23 27 00 00 - 2 | (0,2) | 12 | f | f | 24 27 00 00 - 3 | (0,3) | 12 | f | f | 25 27 00 00 - 4 | (0,4) | 12 | f | f | 26 27 00 00 - 5 | (0,5) | 12 | f | f | 27 27 00 00 - 6 | (0,6) | 12 | f | f | 28 27 00 00 - 7 | (0,7) | 12 | f | f | 29 27 00 00 - 8 | (0,8) | 12 | f | f | 2a 27 00 00 + test=# SELECT itemoffset, ctid, itemlen, nulls, vars, data, dead, htid, tids[0:2] AS some_tids + FROM bt_page_items(get_raw_page('tenk2_hundred', 5)); + itemoffset | ctid | itemlen | nulls | vars | data | dead | htid | some_tids +------------+-----------+---------+-------+------+-------------------------+------+--------+--------------------- + 1 | (16,1) | 16 | f | f | 30 00 00 00 00 00 00 00 | | | + 2 | (16,8292) | 616 | f | f | 24 00 00 00 00 00 00 00 | f | (1,6) | {"(1,6)","(10,22)"} + 3 | (16,8292) | 616 | f | f | 25 00 00 00 00 00 00 00 | f | (1,18) | {"(1,18)","(4,22)"} + 4 | (16,8292) | 616 | f | f | 26 00 00 00 00 00 00 00 | f | (4,18) | {"(4,18)","(6,17)"} + 5 | (16,8292) | 616 | f | f | 27 00 00 00 00 00 00 00 | f | (1,2) | {"(1,2)","(1,19)"} + 6 | (16,8292) | 616 | f | f | 28 00 00 00 00 00 00 00 | f | (2,24) | {"(2,24)","(4,11)"} + 7 | (16,8292) | 616 | f | f | 29 00 00 00 00 00 00 00 | f | (2,17) | {"(2,17)","(11,2)"} + 8 | (16,8292) | 616 | f | f | 2a 00 00 00 00 00 00 00 | f | (0,25) | {"(0,25)","(3,20)"} + 9 | (16,8292) | 616 | f | f | 2b 00 00 00 00 00 00 00 | f | (0,10) | {"(0,10)","(0,14)"} + 10 | (16,8292) | 616 | f | f | 2c 00 00 00 00 00 00 00 | f | (1,3) | {"(1,3)","(3,9)"} + 11 | (16,8292) | 616 | f | f | 2d 00 00 00 00 00 00 00 | f | (6,28) | {"(6,28)","(11,1)"} + 12 | (16,8292) | 616 | f | f | 2e 00 00 00 00 00 00 00 | f | (0,27) | {"(0,27)","(1,13)"} + 13 | (16,8292) | 616 | f | f | 2f 00 00 00 00 00 00 00 | f | (4,17) | {"(4,17)","(4,21)"} +(13 rows) All the other details are the same as explained in the previous item.