mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-02-23 19:39:53 +08:00
Arrange to cache btree metapage data in the relcache entry for the index,
thereby saving a visit to the metapage in most index searches/updates. This doesn't actually save any I/O (since in the old regime the metapage generally stayed in cache anyway), but it does provide a useful decrease in bufmgr traffic in high-contention scenarios. Per my recent proposal.
This commit is contained in:
parent
89083876c9
commit
d2896a9ed1
src
@ -1,4 +1,4 @@
|
|||||||
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.9 2006/01/17 00:09:00 tgl Exp $
|
$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
|
||||||
|
|
||||||
This directory contains a correct implementation of Lehman and Yao's
|
This directory contains a correct implementation of Lehman and Yao's
|
||||||
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
|
||||||
@ -316,7 +316,17 @@ Other things that are handy to know
|
|||||||
|
|
||||||
Page zero of every btree is a meta-data page. This page stores the
|
Page zero of every btree is a meta-data page. This page stores the
|
||||||
location of the root page --- both the true root and the current effective
|
location of the root page --- both the true root and the current effective
|
||||||
root ("fast" root).
|
root ("fast" root). To avoid fetching the metapage for every single index
|
||||||
|
search, we cache a copy of the meta-data information in the index's
|
||||||
|
relcache entry (rd_amcache). This is a bit ticklish since using the cache
|
||||||
|
implies following a root page pointer that could be stale. We require
|
||||||
|
every metapage update to send out an SI (shared invalidation) "relcache inval" message on the
|
||||||
|
index relation. That ensures that each backend will flush its cached copy
|
||||||
|
no later than the start of its next transaction. Therefore, stale
|
||||||
|
pointers cannot be used for longer than the current transaction, which
|
||||||
|
reduces the problem to the same one already dealt with for concurrent
|
||||||
|
VACUUM --- we can just imagine that each open transaction is potentially
|
||||||
|
"already in flight" to the old root.
|
||||||
|
|
||||||
The algorithm assumes we can fit at least three items per page
|
The algorithm assumes we can fit at least three items per page
|
||||||
(a "high key" and two real data items). Therefore it's unsafe
|
(a "high key" and two real data items). Therefore it's unsafe
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.135 2006/04/13 03:53:05 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -18,6 +18,7 @@
|
|||||||
#include "access/heapam.h"
|
#include "access/heapam.h"
|
||||||
#include "access/nbtree.h"
|
#include "access/nbtree.h"
|
||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
|
#include "utils/inval.h"
|
||||||
|
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
@ -638,9 +639,12 @@ _bt_insertonpg(Relation rel,
|
|||||||
|
|
||||||
END_CRIT_SECTION();
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
/* release pin/lock */
|
/* release buffers; send out relcache inval if metapage changed */
|
||||||
if (BufferIsValid(metabuf))
|
if (BufferIsValid(metabuf))
|
||||||
|
{
|
||||||
|
CacheInvalidateRelcache(rel);
|
||||||
_bt_relbuf(rel, metabuf);
|
_bt_relbuf(rel, metabuf);
|
||||||
|
}
|
||||||
|
|
||||||
_bt_relbuf(rel, buf);
|
_bt_relbuf(rel, buf);
|
||||||
}
|
}
|
||||||
@ -1526,6 +1530,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
|||||||
|
|
||||||
END_CRIT_SECTION();
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
|
/* send out relcache inval for metapage change */
|
||||||
|
CacheInvalidateRelcache(rel);
|
||||||
|
|
||||||
/* done with metapage */
|
/* done with metapage */
|
||||||
_bt_relbuf(rel, metabuf);
|
_bt_relbuf(rel, metabuf);
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.95 2006/04/01 03:03:36 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
|
||||||
*
|
*
|
||||||
* NOTES
|
* NOTES
|
||||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||||
@ -26,6 +26,7 @@
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "storage/freespace.h"
|
#include "storage/freespace.h"
|
||||||
#include "storage/lmgr.h"
|
#include "storage/lmgr.h"
|
||||||
|
#include "utils/inval.h"
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -99,6 +100,49 @@ _bt_getroot(Relation rel, int access)
|
|||||||
uint32 rootlevel;
|
uint32 rootlevel;
|
||||||
BTMetaPageData *metad;
|
BTMetaPageData *metad;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Try to use previously-cached metapage data to find the root. This
|
||||||
|
* normally saves one buffer access per index search, which is a very
|
||||||
|
* helpful savings in bufmgr traffic and hence contention.
|
||||||
|
*/
|
||||||
|
if (rel->rd_amcache != NULL)
|
||||||
|
{
|
||||||
|
metad = (BTMetaPageData *) rel->rd_amcache;
|
||||||
|
/* We shouldn't have cached it if any of these fail */
|
||||||
|
Assert(metad->btm_magic == BTREE_MAGIC);
|
||||||
|
Assert(metad->btm_version == BTREE_VERSION);
|
||||||
|
Assert(metad->btm_root != P_NONE);
|
||||||
|
|
||||||
|
rootblkno = metad->btm_fastroot;
|
||||||
|
Assert(rootblkno != P_NONE);
|
||||||
|
rootlevel = metad->btm_fastlevel;
|
||||||
|
|
||||||
|
rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
|
||||||
|
rootpage = BufferGetPage(rootbuf);
|
||||||
|
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Since the cache might be stale, we check the page more carefully
|
||||||
|
* here than normal. We *must* check that it's not deleted.
|
||||||
|
* If it's not alone on its level, then we reject too --- this
|
||||||
|
* may be overly paranoid but better safe than sorry. Note we
|
||||||
|
* don't check P_ISROOT, because that's not set in a "fast root".
|
||||||
|
*/
|
||||||
|
if (!P_IGNORE(rootopaque) &&
|
||||||
|
rootopaque->btpo.level == rootlevel &&
|
||||||
|
P_LEFTMOST(rootopaque) &&
|
||||||
|
P_RIGHTMOST(rootopaque))
|
||||||
|
{
|
||||||
|
/* OK, accept cached page as the root */
|
||||||
|
return rootbuf;
|
||||||
|
}
|
||||||
|
_bt_relbuf(rel, rootbuf);
|
||||||
|
/* Cache is stale, throw it away */
|
||||||
|
if (rel->rd_amcache)
|
||||||
|
pfree(rel->rd_amcache);
|
||||||
|
rel->rd_amcache = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||||
metapg = BufferGetPage(metabuf);
|
metapg = BufferGetPage(metabuf);
|
||||||
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
||||||
@ -200,6 +244,12 @@ _bt_getroot(Relation rel, int access)
|
|||||||
|
|
||||||
END_CRIT_SECTION();
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Send out relcache inval for metapage change (probably unnecessary
|
||||||
|
* here, but let's be safe).
|
||||||
|
*/
|
||||||
|
CacheInvalidateRelcache(rel);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* swap root write lock for read lock. There is no danger of anyone
|
* swap root write lock for read lock. There is no danger of anyone
|
||||||
* else accessing the new root page while it's unlocked, since no one
|
* else accessing the new root page while it's unlocked, since no one
|
||||||
@ -217,6 +267,13 @@ _bt_getroot(Relation rel, int access)
|
|||||||
Assert(rootblkno != P_NONE);
|
Assert(rootblkno != P_NONE);
|
||||||
rootlevel = metad->btm_fastlevel;
|
rootlevel = metad->btm_fastlevel;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Cache the metapage data for next time
|
||||||
|
*/
|
||||||
|
rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
|
||||||
|
sizeof(BTMetaPageData));
|
||||||
|
memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We are done with the metapage; arrange to release it via first
|
* We are done with the metapage; arrange to release it via first
|
||||||
* _bt_relandgetbuf call
|
* _bt_relandgetbuf call
|
||||||
@ -280,6 +337,16 @@ _bt_gettrueroot(Relation rel)
|
|||||||
uint32 rootlevel;
|
uint32 rootlevel;
|
||||||
BTMetaPageData *metad;
|
BTMetaPageData *metad;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We don't try to use cached metapage data here, since (a) this path is
|
||||||
|
* not performance-critical, and (b) if we are here it suggests our cache
|
||||||
|
* is out-of-date anyway. In light of point (b), it's probably safest to
|
||||||
|
* actively flush any cached metapage info.
|
||||||
|
*/
|
||||||
|
if (rel->rd_amcache)
|
||||||
|
pfree(rel->rd_amcache);
|
||||||
|
rel->rd_amcache = NULL;
|
||||||
|
|
||||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||||
metapg = BufferGetPage(metabuf);
|
metapg = BufferGetPage(metabuf);
|
||||||
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
|
||||||
@ -1052,9 +1119,12 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
|
|||||||
|
|
||||||
END_CRIT_SECTION();
|
END_CRIT_SECTION();
|
||||||
|
|
||||||
/* release buffers */
|
/* release buffers; send out relcache inval if metapage changed */
|
||||||
if (BufferIsValid(metabuf))
|
if (BufferIsValid(metabuf))
|
||||||
|
{
|
||||||
|
CacheInvalidateRelcache(rel);
|
||||||
_bt_relbuf(rel, metabuf);
|
_bt_relbuf(rel, metabuf);
|
||||||
|
}
|
||||||
_bt_relbuf(rel, pbuf);
|
_bt_relbuf(rel, pbuf);
|
||||||
_bt_relbuf(rel, rbuf);
|
_bt_relbuf(rel, rbuf);
|
||||||
_bt_relbuf(rel, buf);
|
_bt_relbuf(rel, buf);
|
||||||
|
@ -12,7 +12,7 @@
|
|||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.144 2006/04/01 03:03:37 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.145 2006/04/25 22:46:05 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -26,6 +26,7 @@
|
|||||||
#include "miscadmin.h"
|
#include "miscadmin.h"
|
||||||
#include "storage/freespace.h"
|
#include "storage/freespace.h"
|
||||||
#include "storage/smgr.h"
|
#include "storage/smgr.h"
|
||||||
|
#include "utils/inval.h"
|
||||||
#include "utils/memutils.h"
|
#include "utils/memutils.h"
|
||||||
|
|
||||||
|
|
||||||
@ -127,6 +128,17 @@ btbuild(PG_FUNCTION_ARGS)
|
|||||||
}
|
}
|
||||||
#endif /* BTREE_BUILD_STATS */
|
#endif /* BTREE_BUILD_STATS */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If we are reindexing a pre-existing index, it is critical to send out
|
||||||
|
* a relcache invalidation SI message to ensure all backends re-read the
|
||||||
|
* index metapage. In most circumstances the update-stats operation will
|
||||||
|
* cause that to happen, but at the moment there are corner cases where
|
||||||
|
* no pg_class update will occur, so force an inval here. XXX FIXME:
|
||||||
|
* the upper levels of CREATE INDEX should handle the stats update as
|
||||||
|
* well as guaranteeing relcache inval.
|
||||||
|
*/
|
||||||
|
CacheInvalidateRelcache(index);
|
||||||
|
|
||||||
/* since we just counted the # of tuples, may as well update stats */
|
/* since we just counted the # of tuples, may as well update stats */
|
||||||
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
|
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
|
||||||
|
|
||||||
|
8
src/backend/utils/cache/relcache.c
vendored
8
src/backend/utils/cache/relcache.c
vendored
@ -8,7 +8,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.238 2006/03/05 15:58:45 momjian Exp $
|
* $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.239 2006/04/25 22:46:05 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -948,6 +948,7 @@ RelationInitIndexAccessInfo(Relation relation)
|
|||||||
*/
|
*/
|
||||||
relation->rd_indexprs = NIL;
|
relation->rd_indexprs = NIL;
|
||||||
relation->rd_indpred = NIL;
|
relation->rd_indpred = NIL;
|
||||||
|
relation->rd_amcache = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1481,6 +1482,10 @@ RelationReloadClassinfo(Relation relation)
|
|||||||
RelationInitPhysicalAddr(relation);
|
RelationInitPhysicalAddr(relation);
|
||||||
/* Make sure targblock is reset in case rel was truncated */
|
/* Make sure targblock is reset in case rel was truncated */
|
||||||
relation->rd_targblock = InvalidBlockNumber;
|
relation->rd_targblock = InvalidBlockNumber;
|
||||||
|
/* Must free any AM cached data, too */
|
||||||
|
if (relation->rd_amcache)
|
||||||
|
pfree(relation->rd_amcache);
|
||||||
|
relation->rd_amcache = NULL;
|
||||||
/* Okay, now it's valid again */
|
/* Okay, now it's valid again */
|
||||||
relation->rd_isvalid = true;
|
relation->rd_isvalid = true;
|
||||||
}
|
}
|
||||||
@ -3141,6 +3146,7 @@ load_relcache_init_file(void)
|
|||||||
rel->rd_indexlist = NIL;
|
rel->rd_indexlist = NIL;
|
||||||
rel->rd_oidindex = InvalidOid;
|
rel->rd_oidindex = InvalidOid;
|
||||||
rel->rd_createSubid = InvalidSubTransactionId;
|
rel->rd_createSubid = InvalidSubTransactionId;
|
||||||
|
rel->rd_amcache = NULL;
|
||||||
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
|
MemSet(&rel->pgstat_info, 0, sizeof(rel->pgstat_info));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
|
||||||
* Portions Copyright (c) 1994, Regents of the University of California
|
* Portions Copyright (c) 1994, Regents of the University of California
|
||||||
*
|
*
|
||||||
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.88 2006/03/05 15:59:07 momjian Exp $
|
* $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.89 2006/04/25 22:46:05 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -167,6 +167,13 @@ typedef struct RelationData
|
|||||||
* cached, namely those with subtype zero. The arrays are indexed by
|
* cached, namely those with subtype zero. The arrays are indexed by
|
||||||
* strategy or support number, which is a sufficient identifier given that
|
* strategy or support number, which is a sufficient identifier given that
|
||||||
* restriction.
|
* restriction.
|
||||||
|
*
|
||||||
|
* Note: rd_amcache is available for index AMs to cache private data about
|
||||||
|
* an index. This must be just a cache since it may get reset at any time
|
||||||
|
* (in particular, it will get reset by a relcache inval message for the
|
||||||
|
* index). If used, it must point to a single memory chunk palloc'd in
|
||||||
|
* rd_indexcxt. A relcache reset will include freeing that chunk and
|
||||||
|
* setting rd_amcache = NULL.
|
||||||
*/
|
*/
|
||||||
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
|
MemoryContext rd_indexcxt; /* private memory cxt for this stuff */
|
||||||
RelationAmInfo *rd_aminfo; /* lookup info for funcs found in pg_am */
|
RelationAmInfo *rd_aminfo; /* lookup info for funcs found in pg_am */
|
||||||
@ -175,6 +182,7 @@ typedef struct RelationData
|
|||||||
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
|
FmgrInfo *rd_supportinfo; /* lookup info for support procedures */
|
||||||
List *rd_indexprs; /* index expression trees, if any */
|
List *rd_indexprs; /* index expression trees, if any */
|
||||||
List *rd_indpred; /* index predicate tree, if any */
|
List *rd_indpred; /* index predicate tree, if any */
|
||||||
|
void *rd_amcache; /* available for use by index AM */
|
||||||
|
|
||||||
/* statistics collection area */
|
/* statistics collection area */
|
||||||
PgStat_Info pgstat_info;
|
PgStat_Info pgstat_info;
|
||||||
|
Loading…
Reference in New Issue
Block a user