mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-21 08:29:39 +08:00
Retain original physical order of tuples in redo of b-tree splits.
The physical order makes no difference to how the system behaves, but minimizing the differences between a master and a standby makes debugging simpler.
This commit is contained in:
parent
7d98054f0d
commit
7e30c186da
@ -27,13 +27,6 @@
|
|||||||
* had been its upper part (pd_upper to pd_special). We assume that the
|
* had been its upper part (pd_upper to pd_special). We assume that the
|
||||||
* tuples had been added to the page in item-number order, and therefore
|
* tuples had been added to the page in item-number order, and therefore
|
||||||
* the one with highest item number appears first (lowest on the page).
|
* the one with highest item number appears first (lowest on the page).
|
||||||
*
|
|
||||||
* NOTE: the way this routine is coded, the rebuilt page will have the items
|
|
||||||
* in correct itemno sequence, but physically the opposite order from the
|
|
||||||
* original, because we insert them in the opposite of itemno order. This
|
|
||||||
* does not matter in any current btree code, but it's something to keep an
|
|
||||||
* eye on. Is it worth changing just on general principles? See also the
|
|
||||||
* notes in btree_xlog_split().
|
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
_bt_restore_page(Page page, char *from, int len)
|
_bt_restore_page(Page page, char *from, int len)
|
||||||
@ -41,14 +34,35 @@ _bt_restore_page(Page page, char *from, int len)
|
|||||||
IndexTupleData itupdata;
|
IndexTupleData itupdata;
|
||||||
Size itemsz;
|
Size itemsz;
|
||||||
char *end = from + len;
|
char *end = from + len;
|
||||||
|
Item items[MaxIndexTuplesPerPage];
|
||||||
|
uint16 itemsizes[MaxIndexTuplesPerPage];
|
||||||
|
int i;
|
||||||
|
int nitems;
|
||||||
|
|
||||||
for (; from < end;)
|
/*
|
||||||
|
* To get the items back in the original order, we add them to the page
|
||||||
|
* in reverse. To figure out where one tuple ends and another begins,
|
||||||
|
* we have to scan them in forward order first.
|
||||||
|
*/
|
||||||
|
i = 0;
|
||||||
|
while (from < end)
|
||||||
{
|
{
|
||||||
/* Need to copy tuple header due to alignment considerations */
|
/* Need to copy tuple header due to alignment considerations */
|
||||||
memcpy(&itupdata, from, sizeof(IndexTupleData));
|
memcpy(&itupdata, from, sizeof(IndexTupleData));
|
||||||
itemsz = IndexTupleDSize(itupdata);
|
itemsz = IndexTupleDSize(itupdata);
|
||||||
itemsz = MAXALIGN(itemsz);
|
itemsz = MAXALIGN(itemsz);
|
||||||
if (PageAddItem(page, (Item) from, itemsz, FirstOffsetNumber,
|
|
||||||
|
items[i] = (Item) from;
|
||||||
|
itemsizes[i] = itemsz;
|
||||||
|
i++;
|
||||||
|
|
||||||
|
from += itemsz;
|
||||||
|
}
|
||||||
|
nitems = i;
|
||||||
|
|
||||||
|
for (i = nitems - 1; i >= 0; i--)
|
||||||
|
{
|
||||||
|
if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
|
||||||
false, false) == InvalidOffsetNumber)
|
false, false) == InvalidOffsetNumber)
|
||||||
elog(PANIC, "_bt_restore_page: cannot add item to page");
|
elog(PANIC, "_bt_restore_page: cannot add item to page");
|
||||||
from += itemsz;
|
from += itemsz;
|
||||||
@ -332,10 +346,13 @@ btree_xlog_split(bool onleft, bool isroot,
|
|||||||
if (BufferIsValid(lbuf))
|
if (BufferIsValid(lbuf))
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* Note that this code ensures that the items remaining on the
|
* To retain the same physical order of the tuples that they had,
|
||||||
* left page are in the correct item number order, but it does not
|
* we initialize a temporary empty page for the left page and add
|
||||||
* reproduce the physical order they would have had. Is this
|
* all the items to that in item number order. This mirrors how
|
||||||
* worth changing? See also _bt_restore_page().
|
* _bt_split() works. It's not strictly required to retain the
|
||||||
|
* same physical order, as long as the items are in the correct
|
||||||
|
* item number order, but it helps debugging. See also
|
||||||
|
* _bt_restore_page(), which does the same for the right page.
|
||||||
*/
|
*/
|
||||||
Page lpage = (Page) BufferGetPage(lbuf);
|
Page lpage = (Page) BufferGetPage(lbuf);
|
||||||
BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
|
BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
|
||||||
@ -343,45 +360,52 @@ btree_xlog_split(bool onleft, bool isroot,
|
|||||||
if (lsn > PageGetLSN(lpage))
|
if (lsn > PageGetLSN(lpage))
|
||||||
{
|
{
|
||||||
OffsetNumber off;
|
OffsetNumber off;
|
||||||
OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage);
|
Page newlpage;
|
||||||
OffsetNumber deletable[MaxOffsetNumber];
|
OffsetNumber leftoff;
|
||||||
int ndeletable = 0;
|
|
||||||
|
|
||||||
/*
|
newlpage = PageGetTempPageCopySpecial(lpage);
|
||||||
* Remove the items from the left page that were copied to the
|
|
||||||
* right page. Also remove the old high key, if any. (We must
|
|
||||||
* remove everything before trying to insert any items, else
|
|
||||||
* we risk not having enough space.)
|
|
||||||
*/
|
|
||||||
if (!P_RIGHTMOST(lopaque))
|
|
||||||
{
|
|
||||||
deletable[ndeletable++] = P_HIKEY;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* newitemoff is given to us relative to the original
|
|
||||||
* page's item numbering, so adjust it for this deletion.
|
|
||||||
*/
|
|
||||||
newitemoff--;
|
|
||||||
}
|
|
||||||
for (off = xlrec->firstright; off <= maxoff; off++)
|
|
||||||
deletable[ndeletable++] = off;
|
|
||||||
if (ndeletable > 0)
|
|
||||||
PageIndexMultiDelete(lpage, deletable, ndeletable);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Add the new item if it was inserted on left page.
|
|
||||||
*/
|
|
||||||
if (onleft)
|
|
||||||
{
|
|
||||||
if (PageAddItem(lpage, newitem, newitemsz, newitemoff,
|
|
||||||
false, false) == InvalidOffsetNumber)
|
|
||||||
elog(PANIC, "failed to add new item to left page after split");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Set high key */
|
/* Set high key */
|
||||||
if (PageAddItem(lpage, left_hikey, left_hikeysz,
|
leftoff = P_HIKEY;
|
||||||
|
if (PageAddItem(newlpage, left_hikey, left_hikeysz,
|
||||||
P_HIKEY, false, false) == InvalidOffsetNumber)
|
P_HIKEY, false, false) == InvalidOffsetNumber)
|
||||||
elog(PANIC, "failed to add high key to left page after split");
|
elog(PANIC, "failed to add high key to left page after split");
|
||||||
|
leftoff = OffsetNumberNext(leftoff);
|
||||||
|
|
||||||
|
for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++)
|
||||||
|
{
|
||||||
|
ItemId itemid;
|
||||||
|
Size itemsz;
|
||||||
|
Item item;
|
||||||
|
|
||||||
|
/* add the new item if it was inserted on left page */
|
||||||
|
if (onleft && off == newitemoff)
|
||||||
|
{
|
||||||
|
if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
|
||||||
|
false, false) == InvalidOffsetNumber)
|
||||||
|
elog(ERROR, "failed to add new item to left page after split");
|
||||||
|
leftoff = OffsetNumberNext(leftoff);
|
||||||
|
}
|
||||||
|
|
||||||
|
itemid = PageGetItemId(lpage, off);
|
||||||
|
itemsz = ItemIdGetLength(itemid);
|
||||||
|
item = PageGetItem(lpage, itemid);
|
||||||
|
if (PageAddItem(newlpage, item, itemsz, leftoff,
|
||||||
|
false, false) == InvalidOffsetNumber)
|
||||||
|
elog(ERROR, "failed to add old item to left page after split");
|
||||||
|
leftoff = OffsetNumberNext(leftoff);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* cope with possibility that newitem goes at the end */
|
||||||
|
if (onleft && off == newitemoff)
|
||||||
|
{
|
||||||
|
if (PageAddItem(newlpage, newitem, newitemsz, leftoff,
|
||||||
|
false, false) == InvalidOffsetNumber)
|
||||||
|
elog(ERROR, "failed to add new item to left page after split");
|
||||||
|
leftoff = OffsetNumberNext(leftoff);
|
||||||
|
}
|
||||||
|
|
||||||
|
PageRestoreTempPage(newlpage, lpage);
|
||||||
|
|
||||||
/* Fix opaque fields */
|
/* Fix opaque fields */
|
||||||
lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
|
lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
|
||||||
|
Loading…
Reference in New Issue
Block a user