From 4f8076e5dd749d98eb74d0dfa6cfb127ded1dba0 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 7 Feb 2021 19:24:03 -0800 Subject: [PATCH v4 2/3] Recycle pages deleted during same VACUUM. TODO: Respect work_mem in temporary space that remembers the details of pages deleted during current VACUUM. Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-Wzk76_P=67iUscb1UN44-gyZL-KgpsXbSxq_bdcMa7Q+wQ@mail.gmail.com --- src/include/access/nbtree.h | 25 ++++++++++- src/backend/access/nbtree/nbtpage.c | 61 +++++++++++++++---------- src/backend/access/nbtree/nbtree.c | 69 ++++++++++++++++++++++------- 3 files changed, 115 insertions(+), 40 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 44d176255a..0e81ff8356 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -272,6 +272,29 @@ BTPageGetDeleteXid(Page page) return contents->safexid; } +/* + * BTVacState is nbtree.c state used during VACUUM. It is exported for use by + * page deletion related code in nbtpage.c. + */ +typedef struct BTPendingRecycle +{ + BlockNumber blkno; + FullTransactionId safexid; +} BTPendingRecycle; + +typedef struct BTVacState +{ + IndexVacuumInfo *info; + IndexBulkDeleteResult *stats; + IndexBulkDeleteCallback callback; + void *callback_state; + BTCycleId cycleid; + MemoryContext pagedelcontext; + BTPendingRecycle *deleted; + uint32 sizedeleted; + uint32 ndeleted; +} BTVacState; + /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost * page. The high key is not a tuple that is used to visit the heap. It is @@ -1142,7 +1165,7 @@ extern void _bt_delitems_vacuum(Relation rel, Buffer buf, extern void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate); -extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf); +extern void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate); /* * prototypes for functions in nbtsearch.c diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index e0d1e585f6..408230cb67 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -50,7 +50,7 @@ static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, - uint32 *ndeleted); + BTVacState *vstate); static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child, BTStack stack, Buffer *subtreeparent, @@ -1763,20 +1763,22 @@ _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib) * should never pass a buffer containing an existing deleted page here. The * lock and pin on caller's buffer will be dropped before we return. * - * Returns the number of pages successfully deleted (zero if page cannot - * be deleted now; could be more than one if parent or right sibling pages - * were deleted too). Note that this does not include pages that we delete - * that the btvacuumscan scan has yet to reach; they'll get counted later - * instead. + * Maintains bulk delete stats for caller, which are taken from vstate. We + * need to cooperate closely with caller here so that whole VACUUM operation + * reliably avoids any double counting of subsidiary-to-leafbuf pages that we + * delete in passing. If such pages happen to be from a block number that is + * ahead of the current scanblkno position, then caller is expected to count + * them directly later on. It's simpler for us to understand caller's + * requirements than it would be for caller to understand when or how a + * deleted page became deleted after the fact. * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ -uint32 -_bt_pagedel(Relation rel, Buffer leafbuf) +void +_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) { - uint32 ndeleted = 0; BlockNumber rightsib; bool rightsib_empty; Page page; @@ -1784,7 +1786,8 @@ _bt_pagedel(Relation rel, Buffer leafbuf) /* * Save original leafbuf block number from caller. Only deleted blocks - * that are <= scanblkno get counted in ndeleted return value. + * that are <= scanblkno are added to bulk delete stat's pages_deleted + * count. */ BlockNumber scanblkno = BufferGetBlockNumber(leafbuf); @@ -1846,7 +1849,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) RelationGetRelationName(rel)))); _bt_relbuf(rel, leafbuf); - return ndeleted; + return; } /* @@ -1876,7 +1879,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) Assert(!P_ISHALFDEAD(opaque)); _bt_relbuf(rel, leafbuf); - return ndeleted; + return; } /* @@ -1925,8 +1928,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) { ReleaseBuffer(leafbuf); - Assert(ndeleted == 0); - return ndeleted; + return; } /* we need an insertion scan key for the search, so build one */ @@ -1967,7 +1969,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) if (!_bt_mark_page_halfdead(rel, leafbuf, stack)) { _bt_relbuf(rel, leafbuf); - return ndeleted; + return; } } @@ -1982,7 +1984,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) { /* Check for interrupts in _bt_unlink_halfdead_page */ if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno, - &rightsib_empty, &ndeleted)) + &rightsib_empty, vstate)) { /* * _bt_unlink_halfdead_page should never fail, since we @@ -1993,7 +1995,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf) * lock and pin on leafbuf for us. */ Assert(false); - return ndeleted; + return; } } @@ -2030,8 +2032,6 @@ _bt_pagedel(Relation rel, Buffer leafbuf) leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE); } - - return ndeleted; } /* @@ -2266,9 +2266,10 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack) */ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, - bool *rightsib_empty, uint32 *ndeleted) + bool *rightsib_empty, BTVacState *vstate) { BlockNumber leafblkno = BufferGetBlockNumber(leafbuf); + IndexBulkDeleteResult *stats = vstate->stats; BlockNumber leafleftsib; BlockNumber leafrightsib; BlockNumber target; @@ -2676,12 +2677,24 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, _bt_relbuf(rel, buf); /* - * If btvacuumscan won't revisit this page in a future btvacuumpage call - * and count it as deleted then, we count it as deleted by current - * btvacuumpage call + * Maintain pages_deleted in a way that takes into account how + * btvacuumpage() will count deleted pages that have yet to become + * scanblkno -- only count page when it's not going to get that treatment + * later on. */ if (target <= scanblkno) - (*ndeleted)++; + stats->pages_deleted++; + + if (vstate->ndeleted >= vstate->sizedeleted) + { + vstate->sizedeleted *= 2; + vstate->deleted = + repalloc(vstate->deleted, + sizeof(BTPendingRecycle) * vstate->sizedeleted); + } + vstate->deleted[vstate->ndeleted].blkno = target; + vstate->deleted[vstate->ndeleted].safexid = safexid; + vstate->ndeleted++; return true; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 1e6da9439f..c3b32bb71c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -32,23 +32,13 @@ #include "storage/indexfsm.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "storage/smgr.h" #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" -/* Working state needed by btvacuumpage */ -typedef struct -{ - IndexVacuumInfo *info; - IndexBulkDeleteResult *stats; - IndexBulkDeleteCallback callback; - void *callback_state; - BTCycleId cycleid; - MemoryContext pagedelcontext; -} BTVacState; - /* * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started. * @@ -920,6 +910,34 @@ _bt_page_recyclable(BTPageOpaque opaque, Page page) return false; } +/* + * _bt_newly_deleted_pages_recycle() -- Are _bt_pagedel pages recyclable now? + * + * Note that we assume that the array is ordered by safexid. No further + * entries can be safe to recycle once we encounter the first non-recyclable + * entry in the deleted array. + */ +static inline void +_bt_newly_deleted_pages_recycle(Relation rel, BTVacState *vstate) +{ + IndexBulkDeleteResult *stats = vstate->stats; + + /* Recompute VACUUM XID boundaries */ + (void) GetOldestNonRemovableTransactionId(NULL); + + for (int i = 0; i < vstate->ndeleted; i++) + { + BlockNumber blkno = vstate->deleted[i].blkno; + FullTransactionId safexid = vstate->deleted[i].safexid; + + if (!GlobalVisCheckRemovableFullXid(NULL, safexid)) + break; + + RecordFreeIndexPage(rel, blkno); + stats->pages_free++; + } +} + /* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells @@ -1054,6 +1072,11 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, "_bt_pagedel", ALLOCSET_DEFAULT_SIZES); + /* Allocate _bt_newly_deleted_pages_recycle related information */ + vstate.sizedeleted = 16384; + vstate.deleted = palloc(sizeof(BTPendingRecycle) * vstate.sizedeleted); + vstate.ndeleted = 0; + /* * The outer loop iterates over all index pages except the metapage, in * physical order (we hope the kernel will cooperate in providing @@ -1122,7 +1145,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * * Note that if no recyclable pages exist, we don't bother vacuuming the * FSM at all. + * + * Before vacuuming the FSM, try to make the most of the pages we + * ourselves deleted: see if they can be recycled already (try to avoid + * waiting until the next VACUUM operation to recycle). Our approach is + * to check the local array of pages that were newly deleted during this + * VACUUM. */ + if (vstate.ndeleted > 0) + _bt_newly_deleted_pages_recycle(rel, &vstate); + + pfree(vstate.deleted); + if (stats->pages_free > 0) IndexFreeSpaceMapVacuum(rel); @@ -1260,6 +1294,13 @@ backtrack: /* * Already deleted page (which could be leaf or internal). Can't * recycle yet. + * + * This is a deleted page that must have been deleted in a previous + * VACUUM operation, that nevertheless cannot be recycled now. There + * is no good reason to expect that to change any time soon, which is + * why it isn't among the pages that _bt_newly_deleted_pages_recycle + * will consider as candidates to recycle at the end of btvacuumscan + * call. */ stats->pages_deleted++; } @@ -1479,12 +1520,10 @@ backtrack: oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); /* - * We trust the _bt_pagedel return value because it does not include - * any page that a future call here from btvacuumscan is expected to - * count. There will be no double-counting. + * _bt_pagedel maintains the bulk delete stats on our behalf */ Assert(blkno == scanblkno); - stats->pages_deleted += _bt_pagedel(rel, buf); + _bt_pagedel(rel, buf, vstate); MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ -- 2.27.0