Optimize btree insertions for common case of increasing values

Remember the last page of an index insert if it's the rightmost leaf
page. If the next entry belongs on and can fit in the remembered page,
insert the new entry there as long as we can get a lock on the page.
Otherwise, fall back on the more expensive method of searching for
the right place to insert the entry.

This provides a performance improvement for the common case where an
index entry is for monotonically increasing or nearly monotonically
increasing value such as an identity field or a current timestamp.

Pavan Deolasee
Reviewed by Claudio Freire, Simon Riggs and Peter Geoghegan

Discussion: https://postgr.es/m/CABOikdM9DrupjyKZZFM5k8-0RCDs1wk6JzEkg7UgSW6QzOwMZw@mail.gmail.com
This commit is contained in:
Andrew Dunstan 2018-03-26 22:39:24 +10:30
parent c515ff8d0a
commit 2b27273435
3 changed files with 486 additions and 21 deletions

View File

@ -23,6 +23,7 @@
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
#include "utils/tqual.h"
@ -85,7 +86,6 @@ static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
int keysz, ScanKey scankey);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
*
@ -111,18 +111,106 @@ _bt_doinsert(Relation rel, IndexTuple itup,
bool is_unique = false;
int natts = rel->rd_rel->relnatts;
ScanKey itup_scankey;
BTStack stack;
BTStack stack = NULL;
Buffer buf;
OffsetNumber offset;
bool fastpath;
/* we need an insertion scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
/*
* It's very common to have an index on an auto-incremented or
* monotonically increasing value. In such cases, every insertion happens
* towards the end of the index. We try to optimise that case by caching
* the right-most leaf of the index. If our cached block is still the
* rightmost leaf, has enough free space to accommodate a new entry and
* the insertion key is strictly greater than the first key in this page,
* then we can safely conclude that the new key will be inserted in the
* cached block. So we simply search within the cached block and insert the
* key at the appropriate location. We call it a fastpath.
*
* Testing has revealed, though, that the fastpath can result in increased
* contention on the exclusive-lock on the rightmost leaf page. So we
* conditionally check if the lock is available. If it's not available then
* we simply abandon the fastpath and take the regular path. This makes
* sense because unavailability of the lock also signals that some other
* backend might be concurrently inserting into the page, thus reducing our
* chances to finding an insertion place in this page.
*/
top:
/* find the first page containing this key */
stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE, NULL);
fastpath = false;
offset = InvalidOffsetNumber;
if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
{
Size itemsz;
Page page;
BTPageOpaque lpageop;
/*
* Conditionally acquire exclusive lock on the buffer before doing any
* checks. If we don't get the lock, we simply follow slowpath. If we
* do get the lock, this ensures that the index state cannot change, as
* far as the rightmost part of the index is concerned.
*/
buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
if (ConditionalLockBuffer(buf))
{
_bt_checkpage(rel, buf);
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
itemsz = IndexTupleSize(itup);
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this
* but we need to be consistent */
/*
* Check if the page is still the rightmost leaf page, has enough
* free space to accommodate the new tuple, no split is in progress
* and the scankey is greater than or equal to the first key on the
* page.
*/
if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
!P_INCOMPLETE_SPLIT(lpageop) &&
!P_IGNORE(lpageop) &&
(PageGetFreeSpace(page) > itemsz) &&
PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
_bt_compare(rel, natts, itup_scankey, page,
P_FIRSTDATAKEY(lpageop)) > 0)
{
fastpath = true;
}
else
{
_bt_relbuf(rel, buf);
/*
* Something did not workout. Just forget about the cached
* block and follow the normal path. It might be set again if
* the conditions are favourble.
*/
RelationSetTargetBlock(rel, InvalidBlockNumber);
}
}
else
{
ReleaseBuffer(buf);
/*
* If someone's holding a lock, it's likely to change anyway,
* so don't try again until we get an updated rightmost leaf.
*/
RelationSetTargetBlock(rel, InvalidBlockNumber);
}
}
if (!fastpath)
{
/* find the first page containing this key */
stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE,
NULL);
/* trade in our read lock for a write lock */
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
@ -130,13 +218,14 @@ top:
/*
* If the page was split between the time that we surrendered our read
* lock and acquired our write lock, then this page may no longer be the
* right place for the key we want to insert. In this case, we need to
* move right in the tree. See Lehman and Yao for an excruciatingly
* precise description.
* lock and acquired our write lock, then this page may no longer be
* the right place for the key we want to insert. In this case, we
* need to move right in the tree. See Lehman and Yao for an
* excruciatingly precise description.
*/
buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
true, stack, BT_WRITE, NULL);
}
/*
* If we're not allowing duplicates, make sure the key isn't already in
@ -184,6 +273,7 @@ top:
XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
/* start over... */
if (stack)
_bt_freestack(stack);
goto top;
}
@ -211,6 +301,7 @@ top:
}
/* be tidy */
if (stack)
_bt_freestack(stack);
_bt_freeskey(itup_scankey);
@ -879,7 +970,16 @@ _bt_insertonpg(Relation rel,
XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
if (P_ISLEAF(lpageop))
{
xlinfo = XLOG_BTREE_INSERT_LEAF;
/*
* Cache the block information if we just inserted into the
* rightmost leaf page of the index.
*/
if (P_RIGHTMOST(lpageop))
RelationSetTargetBlock(rel, BufferGetBlockNumber(buf));
}
else
{
/*

View File

@ -1067,6 +1067,255 @@ select tableoid::regclass, * from idxpart order by a;
(8 rows)
drop table idxpart;
-- test fastpath mechanism for index insertion
create table fastpath (a int, b text, c numeric);
create unique index fpindex1 on fastpath(a);
insert into fastpath values (1, 'b1', 100.00);
insert into fastpath values (1, 'b1', 100.00); -- unique key check
ERROR: duplicate key value violates unique constraint "fpindex1"
DETAIL: Key (a)=(1) already exists.
truncate fastpath;
insert into fastpath select generate_series(1,10000), 'b', 100;
-- vacuum the table so as to improve chances of index-only scans. we can't
-- guarantee if index-only scans will be picked up in all cases or not, but
-- that fuzziness actually helps the test.
vacuum fastpath;
set enable_seqscan to false;
set enable_bitmapscan to false;
explain select sum(a) from fastpath where a = 6456;
QUERY PLAN
------------------------------------------------------------------------------------
Aggregate (cost=4.31..4.32 rows=1 width=8)
-> Index Only Scan using fpindex1 on fastpath (cost=0.29..4.30 rows=1 width=4)
Index Cond: (a = 6456)
(3 rows)
explain select sum(a) from fastpath where a >= 5000 and a < 5700;
QUERY PLAN
-------------------------------------------------------------------------------------
Aggregate (cost=5.41..5.42 rows=1 width=8)
-> Index Only Scan using fpindex1 on fastpath (cost=0.29..5.29 rows=50 width=4)
Index Cond: ((a >= 5000) AND (a < 5700))
(3 rows)
select sum(a) from fastpath where a = 6456;
sum
------
6456
(1 row)
select sum(a) from fastpath where a >= 5000 and a < 5700;
sum
---------
3744650
(1 row)
-- drop the only index on the table and compute hashes for
-- a few queries which orders the results in various different ways.
drop index fpindex1;
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
-- now create a multi-column index with both column asc
create index fpindex2 on fastpath(a, b);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
-- again, vacuum here either forces index-only scans or creates fuzziness
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
-- same queries with a different kind of index now. the final result must not
-- change irrespective of what kind of index we have.
drop index fpindex2;
create index fpindex3 on fastpath(a desc, b asc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
-- repeat again
drop index fpindex3;
create index fpindex4 on fastpath(a asc, b desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
-- and again, this time indexing by (b, a). Note that column "b" has non-unique
-- values.
drop index fpindex4;
create index fpindex5 on fastpath(b asc, a desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
-- one last time
drop index fpindex5;
create index fpindex6 on fastpath(b desc, a desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
6167a852b3e0679886b84a5405b5b53d
(1 row)
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
dfcf2bd5e5fea8397d47b2fd14618d31
(1 row)
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
md5
----------------------------------
2ca216010a558a52d7df12f76dfc77ab
(1 row)
drop table fastpath;
-- intentionally leave some objects around
create table idxpart (a int) partition by range (a);
create table idxpart1 partition of idxpart for values from (0) to (100);

View File

@ -574,6 +574,122 @@ insert into idxpart values (857142, 'six');
select tableoid::regclass, * from idxpart order by a;
drop table idxpart;
-- test fastpath mechanism for index insertion
create table fastpath (a int, b text, c numeric);
create unique index fpindex1 on fastpath(a);
insert into fastpath values (1, 'b1', 100.00);
insert into fastpath values (1, 'b1', 100.00); -- unique key check
truncate fastpath;
insert into fastpath select generate_series(1,10000), 'b', 100;
-- vacuum the table so as to improve chances of index-only scans. we can't
-- guarantee if index-only scans will be picked up in all cases or not, but
-- that fuzziness actually helps the test.
vacuum fastpath;
set enable_seqscan to false;
set enable_bitmapscan to false;
explain select sum(a) from fastpath where a = 6456;
explain select sum(a) from fastpath where a >= 5000 and a < 5700;
select sum(a) from fastpath where a = 6456;
select sum(a) from fastpath where a >= 5000 and a < 5700;
-- drop the only index on the table and compute hashes for
-- a few queries which orders the results in various different ways.
drop index fpindex1;
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
-- now create a multi-column index with both column asc
create index fpindex2 on fastpath(a, b);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
-- again, vacuum here either forces index-only scans or creates fuzziness
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
-- same queries with a different kind of index now. the final result must not
-- change irrespective of what kind of index we have.
drop index fpindex2;
create index fpindex3 on fastpath(a desc, b asc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
-- repeat again
drop index fpindex3;
create index fpindex4 on fastpath(a asc, b desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
-- and again, this time indexing by (b, a). Note that column "b" has non-unique
-- values.
drop index fpindex4;
create index fpindex5 on fastpath(b asc, a desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
-- one last time
drop index fpindex5;
create index fpindex6 on fastpath(b desc, a desc);
truncate fastpath;
insert into fastpath select y.x, 'b' || (y.x/10)::text, 100 from (select generate_series(1,10000) as x) y;
vacuum fastpath;
select md5(string_agg(a::text, b order by a, b asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by a desc, b desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a desc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
select md5(string_agg(a::text, b order by b, a asc)) from fastpath
where a >= 1000 and a < 2000 and b > 'b1' and b < 'b3';
drop table fastpath;
-- intentionally leave some objects around
create table idxpart (a int) partition by range (a);
create table idxpart1 partition of idxpart for values from (0) to (100);