diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index 67159d8529..a2634a0927 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -210,6 +210,56 @@ fit on one pending-list page must have those pages to itself, even if this results in wasting much of the space on the preceding page and the last page for the tuple.) +Concurrency +----------- + +The entry tree and each posting tree is a B-tree, with right-links connecting +sibling pages at the same level. This is the same structure that is used in +the regular B-tree indexam (invented by Lehman & Yao), but we don't support +scanning a GIN trees backwards, so we don't need left-links. + +To avoid deadlocks, B-tree pages must always be locked in the same order: +left to right, and bottom to top. When searching, the tree is traversed from +top to bottom, so the lock on the parent page must be released before +descending to the next level. Concurrent page splits move the keyspace to +right, so after following a downlink, the page actually containing the key +we're looking for might be somewhere to the right of the page we landed on. +In that case, we follow the right-links until we find the page we're looking +for. + +To delete a page, the page's left sibling, the target page, and its parent, +are locked in that order, and the page is marked as deleted. However, a +concurrent search might already have read a pointer to the page, and might be +just about to follow it. A page can be reached via the right-link of its left +sibling, or via its downlink in the parent. + +To prevent a backend from reaching a deleted page via a right-link, when +following a right-link the lock on the previous page is not released until +the lock on next page has been acquired. + +The downlink is more tricky. A search descending the tree must release the +lock on the parent page before locking the child, or it could deadlock with +a concurrent split of the child page; a page split locks the parent, while +already holding a lock on the child page. However, posting trees are only +fully searched from left to right, starting from the leftmost leaf. (The +tree-structure is only needed by insertions, to quickly find the correct +insert location). So as long as we don't delete the leftmost page on each +level, a search can never follow a downlink to page that's about to be +deleted. + +The previous paragraph's reasoning only applies to searches, and only to +posting trees. To protect from inserters following a downlink to a deleted +page, vacuum simply locks out all concurrent insertions to the posting tree, +by holding a super-exclusive lock on the posting tree root. Inserters hold a +pin on the root page, but searches do not, so while new searches cannot begin +while root page is locked, any already-in-progress scans can continue +concurrently with vacuum. In the entry tree, we never delete pages. + +(This is quite different from the mechanism the btree indexam uses to make +page-deletions safe; it stamps the deleted pages with an XID and keeps the +deleted pages around with the right-link intact until all concurrent scans +have finished.) + Limitations ----------- diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index b160551b5e..d2ac679fc4 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -112,10 +112,8 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) /* rightmost page */ break; + stack->buffer = ginStepRight(stack->buffer, btree->index, access); stack->blkno = rightlink; - LockBuffer(stack->buffer, GIN_UNLOCK); - stack->buffer = ReleaseAndReadBuffer(stack->buffer, btree->index, stack->blkno); - LockBuffer(stack->buffer, access); page = BufferGetPage(stack->buffer); } @@ -151,6 +149,41 @@ ginFindLeafPage(GinBtree btree, GinBtreeStack *stack) return NULL; } +/* + * Step right from current page. + * + * The next page is locked first, before releasing the current page. This is + * crucial to protect from concurrent page deletion (see comment in + * ginDeletePage). + */ +Buffer +ginStepRight(Buffer buffer, Relation index, int lockmode) +{ + Buffer nextbuffer; + Page page = BufferGetPage(buffer); + bool isLeaf = GinPageIsLeaf(page); + bool isData = GinPageIsData(page); + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + + nextbuffer = ReadBuffer(index, blkno); + LockBuffer(nextbuffer, lockmode); + UnlockReleaseBuffer(buffer); + + /* Sanity check that the page we stepped to is of similar kind. */ + page = BufferGetPage(nextbuffer); + if (isLeaf != GinPageIsLeaf(page) || isData != GinPageIsData(page)) + elog(ERROR, "right sibling of GIN page is of different type"); + + /* + * Given the proper lock sequence above, we should never land on a + * deleted page. + */ + if (GinPageIsDeleted(page)) + elog(ERROR, "right sibling of GIN page was deleted"); + + return nextbuffer; +} + void freeGinBtreeStack(GinBtreeStack *stack) { @@ -239,12 +272,12 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack, while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) { blkno = GinPageGetOpaque(page)->rightlink; - LockBuffer(buffer, GIN_UNLOCK); - ReleaseBuffer(buffer); if (blkno == InvalidBlockNumber) + { + UnlockReleaseBuffer(buffer); break; - buffer = ReadBuffer(btree->index, blkno); - LockBuffer(buffer, GIN_EXCLUSIVE); + } + buffer = ginStepRight(buffer, btree->index, GIN_EXCLUSIVE); page = BufferGetPage(buffer); } @@ -447,23 +480,21 @@ ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats) { BlockNumber rightlink = GinPageGetOpaque(page)->rightlink; - LockBuffer(parent->buffer, GIN_UNLOCK); - if (rightlink == InvalidBlockNumber) { /* * rightmost page, but we don't find parent, we should use * plain search... */ + LockBuffer(parent->buffer, GIN_UNLOCK); ginFindParents(btree, stack, rootBlkno); parent = stack->parent; page = BufferGetPage(parent->buffer); break; } + parent->buffer = ginStepRight(parent->buffer, btree->index, GIN_EXCLUSIVE); parent->blkno = rightlink; - parent->buffer = ReleaseAndReadBuffer(parent->buffer, btree->index, parent->blkno); - LockBuffer(parent->buffer, GIN_EXCLUSIVE); page = BufferGetPage(parent->buffer); } diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 022bd27b23..0630e7ee9f 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -105,16 +105,11 @@ moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack) /* * We scanned the whole page, so we should take right page */ - stack->blkno = GinPageGetOpaque(page)->rightlink; - if (GinPageRightMost(page)) return false; /* no more pages */ - LockBuffer(stack->buffer, GIN_UNLOCK); - stack->buffer = ReleaseAndReadBuffer(stack->buffer, - btree->index, - stack->blkno); - LockBuffer(stack->buffer, GIN_SHARE); + stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE); + stack->blkno = BufferGetBlockNumber(stack->buffer); stack->off = FirstOffsetNumber; } @@ -132,7 +127,6 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, GinPostingTreeScan *gdi; Buffer buffer; Page page; - BlockNumber blkno; /* Descend to the leftmost leaf page */ gdi = ginPrepareScanPostingTree(index, rootPostingTree, TRUE); @@ -162,10 +156,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry, if (GinPageRightMost(page)) break; /* no more pages */ - blkno = GinPageGetOpaque(page)->rightlink; - LockBuffer(buffer, GIN_UNLOCK); - buffer = ReleaseAndReadBuffer(buffer, index, blkno); - LockBuffer(buffer, GIN_SHARE); + buffer = ginStepRight(buffer, index, GIN_SHARE); } UnlockReleaseBuffer(buffer); @@ -544,7 +535,6 @@ static void entryGetNextItem(GinState *ginstate, GinScanEntry entry) { Page page; - BlockNumber blkno; for (;;) { @@ -562,23 +552,18 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry) * It's needed to go by right link. During that we should refind * first ItemPointer greater that stored */ - - blkno = GinPageGetOpaque(page)->rightlink; - - LockBuffer(entry->buffer, GIN_UNLOCK); - if (blkno == InvalidBlockNumber) + if (GinPageRightMost(page)) { - ReleaseBuffer(entry->buffer); + UnlockReleaseBuffer(entry->buffer); ItemPointerSetInvalid(&entry->curItem); entry->buffer = InvalidBuffer; entry->isFinished = TRUE; return; } - entry->buffer = ReleaseAndReadBuffer(entry->buffer, - ginstate->index, - blkno); - LockBuffer(entry->buffer, GIN_SHARE); + entry->buffer = ginStepRight(entry->buffer, + ginstate->index, + GIN_SHARE); page = BufferGetPage(entry->buffer); entry->offset = InvalidOffsetNumber; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 5418fa0f74..be9c5e9ce4 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -238,6 +238,9 @@ ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, return hasVoidPage; } +/* + * Delete a posting tree page. + */ static void ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) @@ -247,39 +250,35 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn Buffer pBuffer; Page page, parentPage; + BlockNumber rightlink; + /* + * Lock the pages in the same order as an insertion would, to avoid + * deadlocks: left, then right, then parent. + */ + lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, + RBM_NORMAL, gvs->strategy); dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, RBM_NORMAL, gvs->strategy); - - if (leftBlkno != InvalidBlockNumber) - lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, - RBM_NORMAL, gvs->strategy); - else - lBuffer = InvalidBuffer; - pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, RBM_NORMAL, gvs->strategy); + LockBuffer(lBuffer, GIN_EXCLUSIVE); LockBuffer(dBuffer, GIN_EXCLUSIVE); if (!isParentRoot) /* parent is already locked by * LockBufferForCleanup() */ LockBuffer(pBuffer, GIN_EXCLUSIVE); - if (leftBlkno != InvalidBlockNumber) - LockBuffer(lBuffer, GIN_EXCLUSIVE); START_CRIT_SECTION(); - if (leftBlkno != InvalidBlockNumber) - { - BlockNumber rightlink; + /* Unlink the page by changing left sibling's rightlink */ + page = BufferGetPage(dBuffer); + rightlink = GinPageGetOpaque(page)->rightlink; - page = BufferGetPage(dBuffer); - rightlink = GinPageGetOpaque(page)->rightlink; - - page = BufferGetPage(lBuffer); - GinPageGetOpaque(page)->rightlink = rightlink; - } + page = BufferGetPage(lBuffer); + GinPageGetOpaque(page)->rightlink = rightlink; + /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING do @@ -364,10 +363,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn if (!isParentRoot) LockBuffer(pBuffer, GIN_UNLOCK); ReleaseBuffer(pBuffer); - - if (leftBlkno != InvalidBlockNumber) - UnlockReleaseBuffer(lBuffer); - + UnlockReleaseBuffer(lBuffer); UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); @@ -435,15 +431,12 @@ ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, DataPageDel if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) { - if (!(me->leftBlkno == InvalidBlockNumber && GinPageRightMost(page))) + /* we never delete the left- or rightmost branch */ + if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page)) { - /* we never delete right most branch */ Assert(!isRoot); - if (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber) - { - ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); - meDelete = TRUE; - } + ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); + meDelete = TRUE; } } diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index e56a92358c..5068d931cc 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -513,6 +513,7 @@ typedef struct GinBtreeData extern GinBtreeStack *ginPrepareFindLeafPage(GinBtree btree, BlockNumber blkno); extern GinBtreeStack *ginFindLeafPage(GinBtree btree, GinBtreeStack *stack); +extern Buffer ginStepRight(Buffer buffer, Relation index, int lockmode); extern void freeGinBtreeStack(GinBtreeStack *stack); extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack, GinStatsData *buildStats);