From 71e5703357fc99028cfeb8404eae99d98f62b36f Mon Sep 17 00:00:00 2001 From: Andrey Date: Thu, 22 Nov 2018 23:14:46 +0500 Subject: [PATCH] Use correct locking protocol during GIN posting tree vacuum --- src/backend/access/gin/README | 17 +- src/backend/access/gin/ginbtree.c | 6 +- src/backend/access/gin/gindatapage.c | 10 +- src/backend/access/gin/ginentrypage.c | 6 +- src/backend/access/gin/ginutil.c | 7 +- src/backend/access/gin/ginvacuum.c | 321 +++++++++++--------------- src/backend/access/gin/ginxlog.c | 1 + src/include/access/gin_private.h | 9 +- src/include/access/ginxlog.h | 1 + 9 files changed, 159 insertions(+), 219 deletions(-) diff --git a/src/backend/access/gin/README b/src/backend/access/gin/README index cc434b1feb..947c8355e5 100644 --- a/src/backend/access/gin/README +++ b/src/backend/access/gin/README @@ -311,20 +311,9 @@ insert location). So as long as we don't delete the leftmost page on each level, a search can never follow a downlink to page that's about to be deleted. -The previous paragraph's reasoning only applies to searches, and only to -posting trees. To protect from inserters following a downlink to a deleted -page, vacuum simply locks out all concurrent insertions to the posting tree, -by holding a super-exclusive lock on the parent page of subtree with deletable -pages. Inserters hold a pin on the root page, but searches do not, so while -new searches cannot begin while root page is locked, any already-in-progress -scans can continue concurrently with vacuum in corresponding subtree of -posting tree. To exclude interference with readers vacuum takes exclusive -locks in a depth-first scan in left-to-right order of page tuples. Leftmost -page is never deleted. Thus before deleting any page we obtain exclusive -lock on any left page, effectively excluding deadlock with any reader, despite -taking parent lock before current and left lock after current. We take left -lock not for a concurrency reasons, but rather in need to mark page dirty. -In the entry tree, we never delete pages. +During vacuum of posting tree leaf pages of posting tree are scanned with +lock coupling, iterating by rightlink. If leaf if empty, we try to lookup +parent page for it and delete it. (This is quite different from the mechanism the btree indexam uses to make page-deletions safe; it stamps the deleted pages with an XID and keeps the diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index d5a568106c..ec59c342d9 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -271,9 +271,9 @@ ginFindParents(GinBtree btree, GinBtreeStack *stack) ginFinishSplit(btree, ptr, false, NULL); } - leftmostBlkno = btree->getLeftMostChild(btree, page); + leftmostBlkno = btree->getLeftMostChild(page); - while ((offset = btree->findChildPtr(btree, page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) + while ((offset = btree->findChildPtr(page, stack->blkno, InvalidOffsetNumber)) == InvalidOffsetNumber) { blkno = GinPageGetOpaque(page)->rightlink; if (blkno == InvalidBlockNumber) @@ -708,7 +708,7 @@ ginFinishSplit(GinBtree btree, GinBtreeStack *stack, bool freestack, /* move right if it's needed */ page = BufferGetPage(parent->buffer); - while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) + while ((parent->off = btree->findChildPtr(page, stack->blkno, parent->off)) == InvalidOffsetNumber) { if (GinPageRightMost(page)) { diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 9f20513811..529269a2ad 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -262,7 +262,7 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) { stack->off = FirstOffsetNumber; stack->predictNumber *= GinPageGetOpaque(page)->maxoff; - return btree->getLeftMostChild(btree, page); + return btree->getLeftMostChild(page); } low = FirstOffsetNumber; @@ -312,8 +312,8 @@ dataLocateItem(GinBtree btree, GinBtreeStack *stack) /* * Find link to blkno on non-leaf page, returns offset of PostingItem */ -static OffsetNumber -dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +OffsetNumber +dataFindChildPtr(Page page, BlockNumber blkno, OffsetNumber storedOff) { OffsetNumber i, maxoff = GinPageGetOpaque(page)->maxoff; @@ -357,8 +357,8 @@ dataFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber stor /* * Return blkno of leftmost child */ -static BlockNumber -dataGetLeftMostPage(GinBtree btree, Page page) +BlockNumber +dataGetLeftMostPage(Page page) { PostingItem *pitem; diff --git a/src/backend/access/gin/ginentrypage.c b/src/backend/access/gin/ginentrypage.c index 184cc0af3e..df3868820a 100644 --- a/src/backend/access/gin/ginentrypage.c +++ b/src/backend/access/gin/ginentrypage.c @@ -284,7 +284,7 @@ entryLocateEntry(GinBtree btree, GinBtreeStack *stack) { stack->off = FirstOffsetNumber; stack->predictNumber *= PageGetMaxOffsetNumber(page); - return btree->getLeftMostChild(btree, page); + return btree->getLeftMostChild(page); } low = FirstOffsetNumber; @@ -403,7 +403,7 @@ entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack) } static OffsetNumber -entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff) +entryFindChildPtr(Page page, BlockNumber blkno, OffsetNumber storedOff) { OffsetNumber i, maxoff = PageGetMaxOffsetNumber(page); @@ -444,7 +444,7 @@ entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber sto } static BlockNumber -entryGetLeftMostPage(GinBtree btree, Page page) +entryGetLeftMostPage(Page page) { IndexTuple itup; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 0a32182dd7..bf354417ee 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -314,7 +314,12 @@ GinNewBuffer(Relation index) if (PageIsNew(page)) return buffer; /* OK to use, if never initialized */ - if (GinPageIsDeleted(page)) + /* + * We should not reuse page until every transaction started before + * deletion is over + */ + if (GinPageIsDeleted(page) + && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalDataXmin)) return buffer; /* OK to use */ LockBuffer(buffer, GIN_UNLOCK); diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 3104bc12b6..217e214982 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -126,33 +126,14 @@ typedef struct DataPageDeleteStack * Delete a posting tree page. */ static void -ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkno, - BlockNumber parentBlkno, OffsetNumber myoff, bool isParentRoot) +ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, Buffer dBuffer, + Buffer lBuffer, Buffer pBuffer, OffsetNumber myoff) { - Buffer dBuffer; - Buffer lBuffer; - Buffer pBuffer; Page page, parentPage; BlockNumber rightlink; - /* - * This function MUST be called only if someone of parent pages hold - * exclusive cleanup lock. This guarantees that no insertions currently - * happen in this subtree. Caller also acquire Exclusive lock on deletable - * page and is acquiring and releasing exclusive lock on left page before. - * Left page was locked and released. Then parent and this page are - * locked. We acquire left page lock here only to mark page dirty after - * changing right pointer. - */ - lBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, leftBlkno, - RBM_NORMAL, gvs->strategy); - dBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, deleteBlkno, - RBM_NORMAL, gvs->strategy); - pBuffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, parentBlkno, - RBM_NORMAL, gvs->strategy); - - LockBuffer(lBuffer, GIN_EXCLUSIVE); + /* We epect that buffers are correctly locked */ page = BufferGetPage(dBuffer); rightlink = GinPageGetOpaque(page)->rightlink; @@ -169,6 +150,9 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn page = BufferGetPage(lBuffer); GinPageGetOpaque(page)->rightlink = rightlink; + /* for each deletead page we remember last xid which could knew it's address */ + GinPageSetDeleteXid(page, ReadNewTransactionId()); + /* Delete downlink from parent */ parentPage = BufferGetPage(pBuffer); #ifdef USE_ASSERT_CHECKING @@ -213,6 +197,7 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn data.parentOffset = myoff; data.rightLink = GinPageGetOpaque(page)->rightlink; + data.deleteXid = GinPageGetDeleteXid(page); XLogRegisterData((char *) &data, sizeof(ginxlogDeletePage)); @@ -222,215 +207,169 @@ ginDeletePage(GinVacuumState *gvs, BlockNumber deleteBlkno, BlockNumber leftBlkn PageSetLSN(BufferGetPage(lBuffer), recptr); } - ReleaseBuffer(pBuffer); - UnlockReleaseBuffer(lBuffer); - ReleaseBuffer(dBuffer); + UnlockReleaseBuffer(pBuffer); + UnlockReleaseBuffer(dBuffer); END_CRIT_SECTION(); gvs->result->pages_deleted++; } - -/* - * scans posting tree and deletes empty pages - */ -static bool -ginScanToDelete(GinVacuumState *gvs, BlockNumber blkno, bool isRoot, - DataPageDeleteStack *parent, OffsetNumber myoff) +/* Descent from root to leftmost leaf page, remembering parent */ +static Buffer +getPostingTreeLeftmostLeaf(GinVacuumState *gvs, + BlockNumber blkno, + BlockNumber *leftmost_parent_blkno) { - DataPageDeleteStack *me; - Buffer buffer; - Page page; - bool meDelete = false; - bool isempty; - - if (isRoot) - { - me = parent; - } - else + *leftmost_parent_blkno = InvalidBlockNumber; + /* Probably, we can have sanity check here? */ + while (true) { - if (!parent->child) - { - me = (DataPageDeleteStack *) palloc0(sizeof(DataPageDeleteStack)); - me->parent = parent; - parent->child = me; - me->leftBlkno = InvalidBlockNumber; - } - else - me = parent->child; - } - - buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, - RBM_NORMAL, gvs->strategy); - - if (!isRoot) - LockBuffer(buffer, GIN_EXCLUSIVE); - - page = BufferGetPage(buffer); - - Assert(GinPageIsData(page)); - - if (!GinPageIsLeaf(page)) - { - OffsetNumber i; + Buffer buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, + blkno, + RBM_NORMAL, gvs->strategy); + Page page = BufferGetPage(buffer); + LockBuffer(buffer, GIN_SHARE); - me->blkno = blkno; - for (i = FirstOffsetNumber; i <= GinPageGetOpaque(page)->maxoff; i++) + if (GinPageIsLeaf(page)) { - PostingItem *pitem = GinDataPageGetPostingItem(page, i); - - if (ginScanToDelete(gvs, PostingItemGetBlockNumber(pitem), false, me, i)) - i--; + LockBuffer(buffer, GIN_UNLOCK); + LockBufferForCleanup(buffer); + /* Root can become internal page, recheck */ + if (GinPageIsLeaf(page)) + { + return buffer; + } } - } - if (GinPageIsLeaf(page)) - isempty = GinDataLeafPageIsEmpty(page); - else - isempty = GinPageGetOpaque(page)->maxoff < FirstOffsetNumber; + *leftmost_parent_blkno = blkno; + blkno = dataGetLeftMostPage(page); + UnlockReleaseBuffer(buffer); + } +} - if (isempty) +/* + * Find parent for leaf page using information about previously known parent + * Returned buffer must be exclusively locked + */ +static Buffer tryFindParentForLeaf(GinVacuumState *gvs, + BlockNumber *parent_cache, + BlockNumber leaf_blkno, + OffsetNumber *myoff) +{ + Buffer buffer; + Page page; + OffsetNumber offnum; + /* Potentially we scan through all lowest-level internal pages */ + while (true) { - /* we never delete the left- or rightmost branch */ - if (me->leftBlkno != InvalidBlockNumber && !GinPageRightMost(page)) + /* we have nowhere to search anymore */ + if (!BlockNumberIsValid(*parent_cache)) + return InvalidBuffer; + + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, + *parent_cache, + RBM_NORMAL, gvs->strategy); + page = BufferGetPage(buffer); + LockBuffer(buffer, GIN_EXCLUSIVE); + Assert(!GinPageIsLeaf(page)); + + /* try to find downlink to deletable page */ + offnum = dataFindChildPtr(page, leaf_blkno, InvalidOffsetNumber); + if (OffsetNumberIsValid(offnum)) { - Assert(!isRoot); - ginDeletePage(gvs, blkno, me->leftBlkno, me->parent->blkno, myoff, me->parent->isRoot); - meDelete = true; + /* + * we should not delete highest keys on page + * posting tree uses them for concurrency reasons + * instead of highkeys + */ + if (offnum == PageGetMaxOffsetNumber(page)) + { + UnlockReleaseBuffer(buffer); + return InvalidBuffer; + } + /* OK, return it */ + *myoff = offnum; + return buffer; } + /* + * downlink to deletable page not found - try next + * in case of races vacuum will not be able to delete leaves, + * but this is acceptable + */ + *parent_cache = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); } - - if (!isRoot) - LockBuffer(buffer, GIN_UNLOCK); - - ReleaseBuffer(buffer); - - if (!meDelete) - me->leftBlkno = blkno; - - return meDelete; } - -/* - * Scan through posting tree, delete empty tuples from leaf pages. - * Also, this function collects empty subtrees (with all empty leafs). - * For parents of these subtrees CleanUp lock is taken, then we call - * ScanToDelete. This is done for every inner page, which points to - * empty subtree. - */ -static bool -ginVacuumPostingTreeLeaves(GinVacuumState *gvs, BlockNumber blkno, bool isRoot) +static void +ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno) { - Buffer buffer; - Page page; - bool hasVoidPage = false; + BlockNumber parentBlkno; + Buffer buffer, prevBuffer; + Page page, prevPage; MemoryContext oldCxt; - buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, - RBM_NORMAL, gvs->strategy); - page = BufferGetPage(buffer); + /* + * Find leftmost page, then move right with lock coupling to guarantee that all + * tuples are removed. + */ + prevBuffer = getPostingTreeLeftmostLeaf(gvs, rootBlkno, &parentBlkno); + prevPage = BufferGetPage(prevBuffer); - ginTraverseLock(buffer, false); + Assert(GinPageIsLeaf(prevPage)); - Assert(GinPageIsData(page)); + oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); + /* leftmost page is undeletable, just clean it */ + ginVacuumPostingTreeLeaf(gvs->index, prevBuffer, gvs); + MemoryContextSwitchTo(oldCxt); + MemoryContextReset(gvs->tmpCxt); - if (GinPageIsLeaf(page)) + while (!GinPageRightMost(prevPage)) { + BlockNumber blkno = GinPageGetOpaque(prevPage)->rightlink; + buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, + blkno, + RBM_NORMAL, gvs->strategy); + page = BufferGetPage(buffer); + LockBufferForCleanup(buffer); + + /* Cleanup tuples on page */ oldCxt = MemoryContextSwitchTo(gvs->tmpCxt); - ginVacuumPostingTreeLeaf(gvs->index, buffer, gvs); + ginVacuumPostingTreeLeaf(gvs->index, prevBuffer, gvs); MemoryContextSwitchTo(oldCxt); MemoryContextReset(gvs->tmpCxt); - /* if root is a leaf page, we don't desire further processing */ - if (GinDataLeafPageIsEmpty(page)) - hasVoidPage = true; - - UnlockReleaseBuffer(buffer); - - return hasVoidPage; - } - else - { - OffsetNumber i; - bool hasEmptyChild = false; - bool hasNonEmptyChild = false; - OffsetNumber maxoff = GinPageGetOpaque(page)->maxoff; - BlockNumber *children = palloc(sizeof(BlockNumber) * (maxoff + 1)); - - /* - * Read all children BlockNumbers. Not sure it is safe if there are - * many concurrent vacuums. - */ - - for (i = FirstOffsetNumber; i <= maxoff; i++) + if (!GinPageRightMost(page) + && GinDataLeafPageIsEmpty(page) + && BlockNumberIsValid(parentBlkno)) { - PostingItem *pitem = GinDataPageGetPostingItem(page, i); - - children[i] = PostingItemGetBlockNumber(pitem); - } + /* try to delete this page */ + OffsetNumber myoff; + Buffer parentBuffer = tryFindParentForLeaf(gvs, &parentBlkno, blkno, &myoff); + if (BufferIsValid(parentBuffer)) + { + /* All pages are properly locked */ + ginDeletePage(gvs, blkno, buffer, prevBuffer, parentBuffer, myoff); - UnlockReleaseBuffer(buffer); + /* prevPage now has new rightlink, be careful not to release prevPage */ - for (i = FirstOffsetNumber; i <= maxoff; i++) - { - if (ginVacuumPostingTreeLeaves(gvs, children[i], false)) - hasEmptyChild = true; - else - hasNonEmptyChild = true; + continue; + } } + UnlockReleaseBuffer(prevBuffer); - pfree(children); - - vacuum_delay_point(); + prevBuffer = buffer; + prevPage = BufferGetPage(prevBuffer); /* - * All subtree is empty - just return true to indicate that parent - * must do a cleanup, unless we are ROOT and there is way to go upper. + * During delaypoint we hold one cleanup lock. We cannot avoid this lock + * to guaranty that all tuples were vacuumed. */ - - if (hasEmptyChild && !hasNonEmptyChild && !isRoot) - return true; - - if (hasEmptyChild) - { - DataPageDeleteStack root, - *ptr, - *tmp; - - buffer = ReadBufferExtended(gvs->index, MAIN_FORKNUM, blkno, - RBM_NORMAL, gvs->strategy); - LockBufferForCleanup(buffer); - - memset(&root, 0, sizeof(DataPageDeleteStack)); - root.leftBlkno = InvalidBlockNumber; - root.isRoot = true; - - ginScanToDelete(gvs, blkno, true, &root, InvalidOffsetNumber); - - ptr = root.child; - - while (ptr) - { - tmp = ptr->child; - pfree(ptr); - ptr = tmp; - } - - UnlockReleaseBuffer(buffer); - } - - /* Here we have deleted all empty subtrees */ - return false; + vacuum_delay_point(); } -} -static void -ginVacuumPostingTree(GinVacuumState *gvs, BlockNumber rootBlkno) -{ - ginVacuumPostingTreeLeaves(gvs, rootBlkno, true); + UnlockReleaseBuffer(prevBuffer); } /* diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 7701a2d6bf..7821f69077 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -518,6 +518,7 @@ ginRedoDeletePage(XLogReaderState *record) page = BufferGetPage(dbuffer); Assert(GinPageIsData(page)); GinPageGetOpaque(page)->flags = GIN_DELETED; + GinPageSetDeleteXid(page, data->deleteXid); PageSetLSN(page, lsn); MarkBufferDirty(dbuffer); } diff --git a/src/include/access/gin_private.h b/src/include/access/gin_private.h index 81bf8734ce..f360196af4 100644 --- a/src/include/access/gin_private.h +++ b/src/include/access/gin_private.h @@ -44,6 +44,9 @@ typedef struct GinOptions #define GIN_SHARE BUFFER_LOCK_SHARE #define GIN_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE +#define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) +#define GinPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val) + /* * GinState: working data structure describing the index being worked on @@ -144,12 +147,12 @@ typedef struct GinBtreeData { /* search methods */ BlockNumber (*findChildPage) (GinBtree, GinBtreeStack *); - BlockNumber (*getLeftMostChild) (GinBtree, Page); + BlockNumber (*getLeftMostChild) (Page); bool (*isMoveRight) (GinBtree, Page); bool (*findItem) (GinBtree, GinBtreeStack *); /* insert methods */ - OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber); + OffsetNumber (*findChildPtr) (Page, BlockNumber, OffsetNumber); GinPlaceToPageRC (*beginPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void **, Page *, Page *); void (*execPlaceToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, void *); void *(*prepareDownlink) (GinBtree, Buffer); @@ -225,6 +228,8 @@ extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno, GinStatsData *buildStats); extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno, Snapshot snapshot); extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage); +extern BlockNumber dataGetLeftMostPage(Page page); +extern OffsetNumber dataFindChildPtr(Page page, BlockNumber blkno, OffsetNumber storedOff); /* * This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers diff --git a/src/include/access/ginxlog.h b/src/include/access/ginxlog.h index 64a3c9e18b..d7f44d6b83 100644 --- a/src/include/access/ginxlog.h +++ b/src/include/access/ginxlog.h @@ -158,6 +158,7 @@ typedef struct ginxlogDeletePage { OffsetNumber parentOffset; BlockNumber rightLink; + TransactionId deleteXid; /* last Xid which could see page in scan */ } ginxlogDeletePage; #define XLOG_GIN_UPDATE_META_PAGE 0x60 -- 2.17.2 (Apple Git-113)