From ba8c73c2aff010703a97b10065ecc978f41cd77d Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Wed, 7 Mar 2018 11:30:11 +0500 Subject: [PATCH 2/2] Physical GiST scan during VACUUM v3 --- src/backend/access/gist/README | 35 ++++ src/backend/access/gist/gistvacuum.c | 337 ++++++++++++++++++++++++++++++----- 2 files changed, 332 insertions(+), 40 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 02228662b8..9548872be8 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops through buffers at a given level until all buffers at that level have been emptied, and then moves down to the next level. +Bulk delete algorithm (VACUUM) +------------------------------ + +Function gistbulkdelete() is responsible for marking empty leaf pages as free +so that they can be used to allocate newly split pages. To find these pages the +function can choose between two strategies: logical scan or physical scan. + +Physical scan reads the entire index from the first page to last. This scan +maintains graph structure in palloc'ed array to collect block numbers of +internal pages that need cleansing from references to empty leaves. Also, the +array contains offsets on the internal page to potentially free leaf pages. This +scan method is chosen when maintenance work memory is sufficient to hold +necessary graph structure. + +The logical scan is chosen when there is not enough maintenance memory to +execute the physical scan. Logical scan traverses GiST index in DFS, looking up +into incomplete split branches. The logical scan can be slower on hard disk +drives. + +The result of both scans is the same: the stack of block numbers of internal +pages with the list of offsets potentially referencing empty leaf pages. After +the scan, for each internal page under exclusive lock, each potentially free +leaf page is examined. 
gistbulkdelete() never deletes the last reference from +an internal page to preserve balanced tree properties. + +The physical scan can return empty leaf page offsets unordered. Thus, before +executing PageIndexMultiDelete, offsets (already locked and checked) are sorted. +This step is not necessary for the logical scan. + +Both scans hold only one lock at a time. Physical scan grabs exclusive lock +instantly, while logical scan takes shared lock and then swaps it to exclusive. +This is done because the amount of work on an internal page done by physical scan is +lower and the number of internal pages is relatively small compared to the number of +leaf pages. + Authors: Teodor Sigaev diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 213e01202b..e2c37a55a6 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -102,8 +102,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) typedef struct GistBDItem { - GistNSN parentlsn; - BlockNumber blkno; + GistNSN parentlsn; + BlockNumber blkno; + OffsetNumber parentoffset; struct GistBDItem *next; } GistBDItem; @@ -128,30 +129,204 @@ pushStackIfSplited(Page page, GistBDItem *stack) } /* - * Bulk deletion of all index entries pointing to a set of heap tuples and - * check invalid tuples left after upgrade. - * The set of target tuples is specified via a callback routine that tells - * whether any given heap tuple (identified by ItemPointer) is being deleted. - * - * Result: a palloc'd struct containing statistical info for VACUUM displays. + * During physical scan for every parent-child pair we can either find parent + * first or child first. Every time we open internal page - we mark the parent + * block number for every child and set GIST_PS_HAS_PARENT. When scan will get to + * child page, if this page turns out to be empty - we will get back by + * parent link. 
If we find child first (still without parent link), we mark + * the page as GIST_PS_EMPTY_LEAF if it is ready to be deleted. When we later + * scan its parent - we will pick it into the rescan list. */ -IndexBulkDeleteResult * -gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +#define GIST_PS_HAS_PARENT 1 +#define GIST_PS_EMPTY_LEAF 2 + + +/* Physical scan item */ +typedef struct GistPSItem { - Relation rel = info->index; - GistBDItem *stack, - *ptr; - BlockNumber recentParent = InvalidBlockNumber; - List *rescanList = NULL; - ListCell *cell; + BlockNumber parent; + List* emptyLeafOffsets; + OffsetNumber parentOffset; + uint16_t flags; +} GistPSItem; + +/* Block numbers of internal pages with offsets to rescan for deletion */ +typedef struct GistRescanItem +{ + BlockNumber blkno; + List* emptyLeafOffsets; + struct GistRescanItem* next; +} GistRescanItem; + +/* Read all pages sequentially populating array of GistPSItem */ +static GistRescanItem* +gistbulkdeletephysicalcan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state, BlockNumber npages) +{ + Relation rel = info->index; + GistRescanItem *result = NULL; + BlockNumber blkno; - /* first time through? */ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - /* we'll re-count the tuples each time */ - stats->estimated_count = false; - stats->num_index_tuples = 0; + /* Here we will store whole graph of the index */ + GistPSItem *graph = palloc0(npages * sizeof(GistPSItem)); + + + for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++) + { + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + + vacuum_delay_point(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + /* + * We are not going to stay here for a long time, calling recursive algorithms. 
+ * Especially for an internal page. So, aggressively grab an exclusive lock. + */ + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + /* TODO: Should not we record free page here? */ + continue; + } + + maxoff = PageGetMaxOffsetNumber(page); + + if (GistPageIsLeaf(page)) + { + OffsetNumber todelete[MaxOffsetNumber]; + int ntodelete = 0; + + /* + * Remove deletable tuples from page + */ + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (callback(&(idxtuple->t_tid), callback_state)) + todelete[ntodelete++] = i; + else + stats->num_index_tuples += 1; + } + + stats->tuples_removed += ntodelete; + + /* We have dead tuples on the page */ + if (ntodelete) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buffer); + + PageIndexMultiDelete(page, todelete, ntodelete); + GistMarkTuplesDeleted(page); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogUpdate(buffer, + todelete, ntodelete, + NULL, 0, InvalidBuffer); + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + END_CRIT_SECTION(); + } + + /* The page is completely empty */ + if (ntodelete == maxoff) + { + /* This page is a candidate to be deleted. 
Remember its parent to rescan it later with xlock */ + if (graph[blkno].flags & GIST_PS_HAS_PARENT) + { + /* Go to parent and append myself */ + BlockNumber parentblockno = graph[blkno].parent; + graph[parentblockno].emptyLeafOffsets = lappend_int(graph[parentblockno].emptyLeafOffsets, (int)graph[blkno].parentOffset); + } + else + { + /* Parent will collect me later */ + graph[blkno].flags |= GIST_PS_EMPTY_LEAF; + } + } + } + else + { + /* For internal pages we remember structure of the tree */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + BlockNumber childblkno; + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF) + { + /* Child has been scanned earlier and is ready to be picked up */ + graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i); + } + else + { + /* Collect leaf when scan will come close */ + graph[childblkno].parent = blkno; + graph[childblkno].parentOffset = i; + graph[childblkno].flags |= GIST_PS_HAS_PARENT; + } + + + if (GistTupleIsInvalid(idxtuple)) + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + } + } + UnlockReleaseBuffer(buffer); + } + + /* Search for internal pages pointing to empty leaves */ + for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++) + { + if (graph[blkno].emptyLeafOffsets) + { + GistRescanItem *next = palloc(sizeof(GistRescanItem)); + next->blkno = blkno; + next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets; + next->next = result; + result = next; + } + } + + pfree(graph); + + return result; +} + +/* Logical scan descends from root to leaves in DFS search */ +static GistRescanItem* 
+gistbulkdeletelogicalscan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +{ + Relation rel = info->index; + BlockNumber recentParent = InvalidBlockNumber; + GistBDItem *stack, + *ptr; + GistRescanItem *result = NULL; + + /* This stack is used to organize DFS */ stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; @@ -236,11 +411,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD END_CRIT_SECTION(); } - if (ntodelete == maxoff && recentParent!=InvalidBlockNumber && - (rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent)) + if (ntodelete == maxoff && recentParent!=InvalidBlockNumber) { /* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */ - rescanList = lappend_int(rescanList, recentParent); + if (result == NULL || result->blkno != recentParent) + { + GistRescanItem *next = palloc(sizeof(GistRescanItem)); + next->blkno = recentParent; + next->emptyLeafOffsets = NULL; + next->next = result; + result = next; + } + result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset); } } else @@ -260,6 +442,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = BufferGetLSNAtomic(buffer); ptr->next = stack->next; + ptr->parentoffset = i; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) @@ -280,20 +463,82 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD vacuum_delay_point(); } - /* rescan inner pages that had empty child pages */ - foreach(cell,rescanList) + return result; +} + +/* + * This function is used to sort offsets for PageIndexMultiDelete + * When employing physical scan rescan offsets are not ordered. 
+ */ +static int +compare_offsetnumber(const void *x, const void *y) +{ + OffsetNumber a = *((OffsetNumber *)x); + OffsetNumber b = *((OffsetNumber *)y); + return a - b; +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples and + * check invalid tuples left after upgrade. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +{ + Relation rel = info->index; + GistRescanItem *rescan; + BlockNumber npages; + bool needLock; + + /* first time through? */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + /* we'll re-count the tuples each time */ + stats->estimated_count = false; + stats->num_index_tuples = 0; + + /* + * Need lock unless it's local to this backend. 
 */ + needLock = !RELATION_IS_LOCAL(rel); + + /* try to find deleted pages */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + npages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* If we have enough space to construct map of whole graph, then we can do sequential reading of all index */ + if (npages * (sizeof(GistPSItem)) > maintenance_work_mem * 1024) { - Buffer buffer; - Page page; - OffsetNumber i, - maxoff; - IndexTuple idxtuple; - ItemId iid; - OffsetNumber todelete[MaxOffsetNumber]; - Buffer buftodelete[MaxOffsetNumber]; - int ntodelete = 0; + rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state); + } + else + { + rescan = gistbulkdeletephysicalcan(info, stats, callback, callback_state, npages); + } - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell), + /* rescan inner pages that had empty child pages */ + while (rescan) + { + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + OffsetNumber todelete[MaxOffsetNumber]; + Buffer buftodelete[MaxOffsetNumber]; + int ntodelete = 0; + ListCell *cell; + GistRescanItem *oldRescan; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); gistcheckpage(rel, buffer); @@ -303,11 +548,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD maxoff = PageGetMaxOffsetNumber(page); - for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i)) + /* Check that leaves are still empty and decide what to delete */ + foreach(cell, rescan->emptyLeafOffsets) { Buffer leafBuffer; Page leafPage; + i = (OffsetNumber)lfirst_int(cell); + if(i > maxoff) + { + continue; + } + iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); @@ -337,7 +589,9 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD 
START_CRIT_SECTION(); MarkBufferDirty(buffer); - PageIndexMultiDelete(page, todelete, ntodelete); + /* Prepare possibly unordered offsets */ + qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber); + PageIndexMultiDelete(page, todelete, ntodelete); if (RelationNeedsWAL(rel)) { @@ -375,11 +629,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD } UnlockReleaseBuffer(buffer); + oldRescan = rescan; + rescan = rescan->next; + list_free(oldRescan->emptyLeafOffsets); + pfree(oldRescan); vacuum_delay_point(); } - list_free(rescanList); return stats; } \ No newline at end of file -- 2.14.3 (Apple Git-98)