From 315483e86611f485136efc6a6f141dd0caf3691c Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Fri, 4 Nov 2022 14:14:42 +0900 Subject: [PATCH v8 5/5] PoC: lazy vacuum integration. The patch includes: * Introducing a new module called TIDStore * Lazy vacuum and parallel vacuum integration. TODOs: * radix tree needs to have the reset funtionality. * should not allow TIDStore to grow beyond the memory limit. * change the progress statistics of pg_stat_progress_vacuum. --- src/backend/access/common/Makefile | 1 + src/backend/access/common/meson.build | 1 + src/backend/access/common/tidstore.c | 273 ++++++++++++++++++++++++++ src/backend/access/heap/vacuumlazy.c | 160 +++++---------- src/backend/commands/vacuum.c | 45 +---- src/backend/commands/vacuumparallel.c | 59 +++--- src/backend/lib/radixtree.c | 9 + src/backend/storage/lmgr/lwlock.c | 2 + src/include/access/tidstore.h | 55 ++++++ src/include/commands/vacuum.h | 24 +-- src/include/lib/radixtree.h | 1 + src/include/storage/lwlock.h | 1 + 12 files changed, 436 insertions(+), 195 deletions(-) create mode 100644 src/backend/access/common/tidstore.c create mode 100644 src/include/access/tidstore.h diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index b9aff0ccfd..67b8cc6108 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -27,6 +27,7 @@ OBJS = \ syncscan.o \ toast_compression.o \ toast_internals.o \ + tidstore.o \ tupconvert.o \ tupdesc.o diff --git a/src/backend/access/common/meson.build b/src/backend/access/common/meson.build index 857beaa32d..76265974b1 100644 --- a/src/backend/access/common/meson.build +++ b/src/backend/access/common/meson.build @@ -13,6 +13,7 @@ backend_sources += files( 'syncscan.c', 'toast_compression.c', 'toast_internals.c', + 'tidstore.c', 'tupconvert.c', 'tupdesc.c', ) diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c new file mode 100644 index 0000000000..8793c87fab --- /dev/null +++ b/src/backend/access/common/tidstore.c @@ -0,0 +1,273 @@ +/*------------------------------------------------------------------------- + * + * tidstore.c + * TID (ItemPointer) storage implementation. + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/common/tidstore.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tidstore.h" +#include "lib/radixtree.h" +#include "utils/dsa.h" +#include "utils/memutils.h" + +/* XXX: should be configurable for non-heap AMs */ +#define TIDSTORE_OFFSET_NBITS 11 /* pg_ceil_log2_32(MaxHeapTuplesPerPage) */ + +#define TIDSTORE_VALUE_NBITS 6 /* log(sizeof(uint64) * BITS_PER_BYTE, 2) */ + +/* Get block number from the key */ +#define KEY_GET_BLKNO(key) \ + ((BlockNumber) ((key) >> (TIDSTORE_OFFSET_NBITS - TIDSTORE_VALUE_NBITS))) + +struct TIDStore +{ + /* main storage for TID */ + radix_tree *tree; + + /* # of tids in TIDStore */ + int num_tids; + + /* DSA area and handle for shared TIDStore */ + dsa_pointer handle; + dsa_area *dsa; +}; + +static void tidstore_iter_collect_tids(TIDStoreIter *iter, uint64 key, uint64 val); +static inline uint64 tid_to_key_off(ItemPointer tid, uint32 *off); + +TIDStore * +tidstore_create(dsa_area *dsa) +{ + TIDStore *ts; + + ts = palloc0(sizeof(TIDStore)); + + ts->tree = rt_create(CurrentMemoryContext, dsa); + ts->dsa = dsa; + + if (dsa != NULL) + ts->handle = rt_get_dsa_pointer(ts->tree); + + return ts; +} + +/* Attach the shared TIDStore */ +TIDStore * +tidstore_attach(dsa_area *dsa, dsa_pointer handle) +{ + TIDStore *ts; + + Assert(dsa != NULL); + Assert(DsaPointerIsValid(handle)); + + ts = palloc0(sizeof(TIDStore)); + + ts->tree = rt_attach(dsa, handle); + + return ts; +} + +void +tidstore_detach(TIDStore *ts) +{ + rt_detach(ts->tree); +} + +void +tidstore_free(TIDStore *ts) +{ + rt_free(ts->tree); + pfree(ts); +} + +void +tidstore_reset(TIDStore *ts) +{ + if (ts->dsa != NULL) + { + /* XXX: reset shared radix tree */ + Assert(false); + } + else + { + ts->num_tids = 0; + + rt_free(ts->tree); + ts->tree = rt_create(CurrentMemoryContext, NULL); + } +} + +/* Add TIDs to TIDStore */ +void +tidstore_add_tids(TIDStore *ts, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets) +{ + uint64 last_key = PG_UINT64_MAX; + uint64 key; + uint64 val = 0; + ItemPointerData tid; + + ItemPointerSetBlockNumber(&tid, blkno); + + for (int i = 0; i < num_offsets; i++) + { + uint32 off; + + ItemPointerSetOffsetNumber(&tid, offsets[i]); + + key = tid_to_key_off(&tid, &off); + + if (last_key != PG_UINT64_MAX && last_key != key) + { + rt_set(ts->tree, last_key, val); + val = 0; + } + + last_key = key; + val |= UINT64CONST(1) << off; + ts->num_tids++; + } + + if (last_key != PG_UINT64_MAX) + { + rt_set(ts->tree, last_key, val); + val = 0; + } +} + +/* Return true if the given TID is present in TIDStore */ +bool +tidstore_lookup_tid(TIDStore *ts, ItemPointer tid) +{ + uint64 key; + uint64 val; + uint32 off; + bool found; + + key = tid_to_key_off(tid, &off); + + found = rt_search(ts->tree, key, &val); + + if (!found) + return false; + + return (val & (UINT64CONST(1) << off)) != 0; +} + +TIDStoreIter * +tidstore_begin_iterate(TIDStore *ts) +{ + TIDStoreIter *iter; + + iter = palloc0(sizeof(TIDStoreIter)); + iter->ts = ts; + iter->tree_iter = rt_begin_iterate(ts->tree); + iter->blkno = InvalidBlockNumber; + + return iter; +} + +bool +tidstore_iterate_next(TIDStoreIter *iter) +{ + uint64 key; + uint64 val; + + if (iter->finished) + return false; + + if (BlockNumberIsValid(iter->blkno)) + { + iter->num_offsets = 0; + tidstore_iter_collect_tids(iter, iter->next_key, iter->next_val); + } + + while (rt_iterate_next(iter->tree_iter, &key, &val)) + { + BlockNumber blkno; + + blkno = KEY_GET_BLKNO(key); + + if (BlockNumberIsValid(iter->blkno) && iter->blkno != blkno) + { + /* + * Remember the key-value pair for the next block for the + * next iteration. + */ + iter->next_key = key; + iter->next_val = val; + return true; + } + + /* Collect tids extracted from the key-value pair */ + tidstore_iter_collect_tids(iter, key, val); + } + + iter->finished = true; + return true; +} + +uint64 +tidstore_num_tids(TIDStore *ts) +{ + return ts->num_tids; +} + +uint64 +tidstore_memory_usage(TIDStore *ts) +{ + return (uint64) sizeof(TIDStore) + rt_memory_usage(ts->tree); +} + +tidstore_handle +tidstore_get_handle(TIDStore *ts) +{ + return rt_get_dsa_pointer(ts->tree); +} + +/* Extract TIDs from key-value pair */ +static void +tidstore_iter_collect_tids(TIDStoreIter *iter, uint64 key, uint64 val) +{ + for (int i = 0; i < sizeof(uint64) * BITS_PER_BYTE; i++) + { + uint64 tid_i; + OffsetNumber off; + + if ((val & (UINT64CONST(1) << i)) == 0) + continue; + + tid_i = key << TIDSTORE_VALUE_NBITS; + tid_i |= i; + + off = tid_i & ((UINT64CONST(1) << TIDSTORE_OFFSET_NBITS) - 1); + iter->offsets[iter->num_offsets++] = off; + } + + iter->blkno = KEY_GET_BLKNO(key); +} + +/* Encode a TID to key and val */ +static inline uint64 +tid_to_key_off(ItemPointer tid, uint32 *off) +{ + uint64 upper; + uint64 tid_i; + + tid_i = ItemPointerGetOffsetNumber(tid); + tid_i |= (uint64) ItemPointerGetBlockNumber(tid) << TIDSTORE_OFFSET_NBITS; + + *off = tid_i & ((1 << TIDSTORE_VALUE_NBITS) - 1); + upper = tid_i >> TIDSTORE_VALUE_NBITS; + Assert(*off < (sizeof(uint64) * BITS_PER_BYTE)); + + return upper; +} diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index dfbe37472f..5b013bc3a8 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -40,6 +40,7 @@ #include "access/heapam_xlog.h" #include "access/htup_details.h" #include "access/multixact.h" +#include "access/tidstore.h" #include "access/transam.h" #include "access/visibilitymap.h" #include "access/xact.h" @@ -144,6 +145,8 @@ typedef struct LVRelState Relation *indrels; int nindexes; + int max_bytes; + /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ bool aggressive; /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ @@ -194,7 +197,7 @@ typedef struct LVRelState * lazy_vacuum_heap_rel, which marks the same LP_DEAD line pointers as * LP_UNUSED during second heap pass. */ - VacDeadItems *dead_items; /* TIDs whose index tuples we'll delete */ + TIDStore *dead_items; /* TIDs whose index tuples we'll delete */ BlockNumber rel_pages; /* total number of pages */ BlockNumber scanned_pages; /* # pages examined (not skipped via VM) */ BlockNumber removed_pages; /* # pages removed by relation truncation */ @@ -265,8 +268,9 @@ static bool lazy_scan_noprune(LVRelState *vacrel, Buffer buf, static void lazy_vacuum(LVRelState *vacrel); static bool lazy_vacuum_all_indexes(LVRelState *vacrel); static void lazy_vacuum_heap_rel(LVRelState *vacrel); -static int lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, - Buffer buffer, int index, Buffer *vmbuffer); +static void lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, + OffsetNumber *offsets, int num_offsets, + Buffer buffer, Buffer *vmbuffer); static bool lazy_check_wraparound_failsafe(LVRelState *vacrel); static void lazy_cleanup_all_indexes(LVRelState *vacrel); static IndexBulkDeleteResult *lazy_vacuum_one_index(Relation indrel, @@ -397,6 +401,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->indname = NULL; vacrel->phase = VACUUM_ERRCB_PHASE_UNKNOWN; vacrel->verbose = verbose; + vacrel->max_bytes = IsAutoVacuumWorkerProcess() && + autovacuum_work_mem != -1 ? + autovacuum_work_mem * 1024L : maintenance_work_mem * 1024L; errcallback.callback = vacuum_error_callback; errcallback.arg = vacrel; errcallback.previous = error_context_stack; @@ -858,7 +865,7 @@ lazy_scan_heap(LVRelState *vacrel) next_unskippable_block, next_failsafe_block = 0, next_fsm_block_to_vacuum = 0; - VacDeadItems *dead_items = vacrel->dead_items; + TIDStore *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; bool next_unskippable_allvis, skipping_current_range; @@ -872,7 +879,7 @@ lazy_scan_heap(LVRelState *vacrel) /* Report that we're scanning the heap, advertising total # of blocks */ initprog_val[0] = PROGRESS_VACUUM_PHASE_SCAN_HEAP; initprog_val[1] = rel_pages; - initprog_val[2] = dead_items->max_items; + initprog_val[2] = vacrel->max_bytes; /* XXX: should use # of tids */ pgstat_progress_update_multi_param(3, initprog_index, initprog_val); /* Set up an initial range of skippable blocks using the visibility map */ @@ -942,8 +949,8 @@ lazy_scan_heap(LVRelState *vacrel) * dead_items TIDs, pause and do a cycle of vacuuming before we tackle * this page. */ - Assert(dead_items->max_items >= MaxHeapTuplesPerPage); - if (dead_items->max_items - dead_items->num_items < MaxHeapTuplesPerPage) + /* XXX: should not allow tidstore to grow beyond max_bytes */ + if (tidstore_memory_usage(vacrel->dead_items) > vacrel->max_bytes) { /* * Before beginning index vacuuming, we release any pin we may @@ -1075,11 +1082,17 @@ lazy_scan_heap(LVRelState *vacrel) if (prunestate.has_lpdead_items) { Size freespace; + TIDStoreIter *iter; - lazy_vacuum_heap_page(vacrel, blkno, buf, 0, &vmbuffer); + iter = tidstore_begin_iterate(vacrel->dead_items); + tidstore_iterate_next(iter); + lazy_vacuum_heap_page(vacrel, blkno, iter->offsets, iter->num_offsets, + buf, &vmbuffer); + Assert(!tidstore_iterate_next(iter)); + pfree(iter); /* Forget the LP_DEAD items that we just vacuumed */ - dead_items->num_items = 0; + tidstore_reset(dead_items); /* * Periodically perform FSM vacuuming to make newly-freed @@ -1116,7 +1129,7 @@ lazy_scan_heap(LVRelState *vacrel) * with prunestate-driven visibility map and FSM steps (just like * the two-pass strategy). */ - Assert(dead_items->num_items == 0); + Assert(tidstore_num_tids(dead_items) == 0); } /* @@ -1269,7 +1282,7 @@ lazy_scan_heap(LVRelState *vacrel) * Do index vacuuming (call each index's ambulkdelete routine), then do * related heap vacuuming */ - if (dead_items->num_items > 0) + if (tidstore_num_tids(dead_items) > 0) lazy_vacuum(vacrel); /* @@ -1903,25 +1916,16 @@ retry: */ if (lpdead_items > 0) { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; + TIDStore *dead_items = vacrel->dead_items; Assert(!prunestate->all_visible); Assert(prunestate->has_lpdead_items); vacrel->lpdead_item_pages++; + tidstore_add_tids(dead_items, blkno, deadoffsets, lpdead_items); - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } - - Assert(dead_items->num_items <= dead_items->max_items); pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); + tidstore_num_tids(dead_items)); } /* Finally, add page-local counts to whole-VACUUM counts */ @@ -2128,8 +2132,7 @@ lazy_scan_noprune(LVRelState *vacrel, } else { - VacDeadItems *dead_items = vacrel->dead_items; - ItemPointerData tmp; + TIDStore *dead_items = vacrel->dead_items; /* * Page has LP_DEAD items, and so any references/TIDs that remain in @@ -2138,17 +2141,10 @@ lazy_scan_noprune(LVRelState *vacrel, */ vacrel->lpdead_item_pages++; - ItemPointerSetBlockNumber(&tmp, blkno); - - for (int i = 0; i < lpdead_items; i++) - { - ItemPointerSetOffsetNumber(&tmp, deadoffsets[i]); - dead_items->items[dead_items->num_items++] = tmp; - } + tidstore_add_tids(dead_items, blkno, deadoffsets, lpdead_items); - Assert(dead_items->num_items <= dead_items->max_items); pgstat_progress_update_param(PROGRESS_VACUUM_NUM_DEAD_TUPLES, - dead_items->num_items); + tidstore_num_tids(dead_items)); vacrel->lpdead_items += lpdead_items; @@ -2197,7 +2193,7 @@ lazy_vacuum(LVRelState *vacrel) if (!vacrel->do_index_vacuuming) { Assert(!vacrel->do_index_cleanup); - vacrel->dead_items->num_items = 0; + tidstore_reset(vacrel->dead_items); return; } @@ -2226,7 +2222,7 @@ lazy_vacuum(LVRelState *vacrel) BlockNumber threshold; Assert(vacrel->num_index_scans == 0); - Assert(vacrel->lpdead_items == vacrel->dead_items->num_items); + Assert(vacrel->lpdead_items == tidstore_num_tids(vacrel->dead_items)); Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2253,8 +2249,8 @@ lazy_vacuum(LVRelState *vacrel) * cases then this may need to be reconsidered. */ threshold = (double) vacrel->rel_pages * BYPASS_THRESHOLD_PAGES; - bypass = (vacrel->lpdead_item_pages < threshold && - vacrel->lpdead_items < MAXDEADITEMS(32L * 1024L * 1024L)); + bypass = (vacrel->lpdead_item_pages < threshold) && + tidstore_memory_usage(vacrel->dead_items) < (32L * 1024L * 1024L); } if (bypass) @@ -2299,7 +2295,7 @@ lazy_vacuum(LVRelState *vacrel) * Forget the LP_DEAD items that we just vacuumed (or just decided to not * vacuum) */ - vacrel->dead_items->num_items = 0; + /* tidstore_reset(vacrel->dead_items); */ } /* @@ -2371,7 +2367,7 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) * place). */ Assert(vacrel->num_index_scans > 0 || - vacrel->dead_items->num_items == vacrel->lpdead_items); + tidstore_num_tids(vacrel->dead_items) == vacrel->lpdead_items); Assert(allindexes || vacrel->failsafe_active); /* @@ -2408,10 +2404,10 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index; BlockNumber vacuumed_pages; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; + TIDStoreIter *iter; Assert(vacrel->do_index_vacuuming); Assert(vacrel->do_index_cleanup); @@ -2428,8 +2424,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuumed_pages = 0; - index = 0; - while (index < vacrel->dead_items->num_items) + iter = tidstore_begin_iterate(vacrel->dead_items); + while (tidstore_iterate_next(iter)) { BlockNumber tblk; Buffer buf; @@ -2438,12 +2434,13 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); - tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + tblk = iter->blkno; vacrel->blkno = tblk; buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL, vacrel->bstrategy); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - index = lazy_vacuum_heap_page(vacrel, tblk, buf, index, &vmbuffer); + lazy_vacuum_heap_page(vacrel, tblk, iter->offsets, iter->num_offsets, + buf, &vmbuffer); /* Now that we've vacuumed the page, record its available space */ page = BufferGetPage(buf); @@ -2467,9 +2464,8 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * We set all LP_DEAD items from the first heap pass to LP_UNUSED during * the second heap pass. No more, no less. */ - Assert(index > 0); Assert(vacrel->num_index_scans > 1 || - (index == vacrel->lpdead_items && + (tidstore_num_tids(vacrel->dead_items) == vacrel->lpdead_items && vacuumed_pages == vacrel->lpdead_item_pages)); ereport(DEBUG2, @@ -2491,11 +2487,10 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) * LP_DEAD item on the page. The return value is the first index immediately * after all LP_DEAD items for the same page in the array. */ -static int -lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, - int index, Buffer *vmbuffer) +static void +lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets, Buffer buffer, Buffer *vmbuffer) { - VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); OffsetNumber unused[MaxHeapTuplesPerPage]; int uncnt = 0; @@ -2514,16 +2509,11 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, START_CRIT_SECTION(); - for (; index < dead_items->num_items; index++) + for (int i = 0; i < num_offsets; i++) { - BlockNumber tblk; - OffsetNumber toff; ItemId itemid; + OffsetNumber toff = offsets[i]; - tblk = ItemPointerGetBlockNumber(&dead_items->items[index]); - if (tblk != blkno) - break; /* past end of tuples for this block */ - toff = ItemPointerGetOffsetNumber(&dead_items->items[index]); itemid = PageGetItemId(page, toff); Assert(ItemIdIsDead(itemid) && !ItemIdHasStorage(itemid)); @@ -2603,7 +2593,6 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); - return index; } /* @@ -3105,46 +3094,6 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) return vacrel->nonempty_pages; } -/* - * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). - * - * See the comments at the head of this file for rationale. - */ -static int -dead_items_max_items(LVRelState *vacrel) -{ - int64 max_items; - int vac_work_mem = IsAutoVacuumWorkerProcess() && - autovacuum_work_mem != -1 ? - autovacuum_work_mem : maintenance_work_mem; - - if (vacrel->nindexes > 0) - { - BlockNumber rel_pages = vacrel->rel_pages; - - max_items = MAXDEADITEMS(vac_work_mem * 1024L); - max_items = Min(max_items, INT_MAX); - max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); - - /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; - - /* stay sane if small maintenance_work_mem */ - max_items = Max(max_items, MaxHeapTuplesPerPage); - } - else - { - /* One-pass case only stores a single heap page's TIDs at a time */ - max_items = MaxHeapTuplesPerPage; - } - - return (int) max_items; -} - /* * Allocate dead_items (either using palloc, or in dynamic shared memory). * Sets dead_items in vacrel for caller. @@ -3155,12 +3104,6 @@ dead_items_max_items(LVRelState *vacrel) static void dead_items_alloc(LVRelState *vacrel, int nworkers) { - VacDeadItems *dead_items; - int max_items; - - max_items = dead_items_max_items(vacrel); - Assert(max_items >= MaxHeapTuplesPerPage); - /* * Initialize state for a parallel vacuum. As of now, only one worker can * be used for an index, so we invoke parallelism only if there are at @@ -3186,7 +3129,6 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) else vacrel->pvs = parallel_vacuum_init(vacrel->rel, vacrel->indrels, vacrel->nindexes, nworkers, - max_items, vacrel->verbose ? INFO : DEBUG2, vacrel->bstrategy); @@ -3199,11 +3141,7 @@ dead_items_alloc(LVRelState *vacrel, int nworkers) } /* Serial VACUUM case */ - dead_items = (VacDeadItems *) palloc(vac_max_items_to_alloc_size(max_items)); - dead_items->max_items = max_items; - dead_items->num_items = 0; - - vacrel->dead_items = dead_items; + vacrel->dead_items = tidstore_create(NULL); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7ccde07de9..03ce9c3b6e 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -2295,16 +2295,16 @@ get_vacoptval_from_boolean(DefElem *def) */ IndexBulkDeleteResult * vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items) + TIDStore *dead_items) { /* Do bulk deletion */ istat = index_bulk_delete(ivinfo, istat, vac_tid_reaped, (void *) dead_items); ereport(ivinfo->message_level, - (errmsg("scanned index \"%s\" to remove %d row versions", + (errmsg("scanned index \"%s\" to remove " UINT64_FORMAT " row versions", RelationGetRelationName(ivinfo->index), - dead_items->num_items))); + tidstore_num_tids(dead_items)))); return istat; } @@ -2335,18 +2335,6 @@ vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat) return istat; } -/* - * Returns the total required space for VACUUM's dead_items array given a - * max_items value. - */ -Size -vac_max_items_to_alloc_size(int max_items) -{ - Assert(max_items <= MAXDEADITEMS(MaxAllocSize)); - - return offsetof(VacDeadItems, items) + sizeof(ItemPointerData) * max_items; -} - /* * vac_tid_reaped() -- is a particular tid deletable? * @@ -2357,32 +2345,9 @@ vac_max_items_to_alloc_size(int max_items) static bool vac_tid_reaped(ItemPointer itemptr, void *state) { - VacDeadItems *dead_items = (VacDeadItems *) state; - int64 litem, - ritem, - item; - ItemPointer res; - - litem = itemptr_encode(&dead_items->items[0]); - ritem = itemptr_encode(&dead_items->items[dead_items->num_items - 1]); - item = itemptr_encode(itemptr); - - /* - * Doing a simple bound check before bsearch() is useful to avoid the - * extra cost of bsearch(), especially if dead items on the heap are - * concentrated in a certain range. Since this function is called for - * every index tuple, it pays to be really fast. - */ - if (item < litem || item > ritem) - return false; - - res = (ItemPointer) bsearch((void *) itemptr, - (void *) dead_items->items, - dead_items->num_items, - sizeof(ItemPointerData), - vac_cmp_itemptr); + TIDStore *dead_items = (TIDStore *) state; - return (res != NULL); + return tidstore_lookup_tid(dead_items, itemptr); } /* diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index f26d796e52..641c98d80b 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -44,7 +44,7 @@ * use small integers. */ #define PARALLEL_VACUUM_KEY_SHARED 1 -#define PARALLEL_VACUUM_KEY_DEAD_ITEMS 2 +#define PARALLEL_VACUUM_KEY_DSA 2 #define PARALLEL_VACUUM_KEY_QUERY_TEXT 3 #define PARALLEL_VACUUM_KEY_BUFFER_USAGE 4 #define PARALLEL_VACUUM_KEY_WAL_USAGE 5 @@ -103,6 +103,9 @@ typedef struct PVShared /* Counter for vacuuming and cleanup */ pg_atomic_uint32 idx; + + /* Handle of the shared TIDStore */ + tidstore_handle dead_items_handle; } PVShared; /* Status used during parallel index vacuum or cleanup */ @@ -166,7 +169,7 @@ struct ParallelVacuumState PVIndStats *indstats; /* Shared dead items space among parallel vacuum workers */ - VacDeadItems *dead_items; + TIDStore *dead_items; /* Points to buffer usage area in DSM */ BufferUsage *buffer_usage; @@ -222,20 +225,22 @@ static void parallel_vacuum_error_callback(void *arg); */ ParallelVacuumState * parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, - int nrequested_workers, int max_items, - int elevel, BufferAccessStrategy bstrategy) + int nrequested_workers, int elevel, + BufferAccessStrategy bstrategy) { ParallelVacuumState *pvs; ParallelContext *pcxt; PVShared *shared; - VacDeadItems *dead_items; + TIDStore *dead_items; PVIndStats *indstats; BufferUsage *buffer_usage; WalUsage *wal_usage; + void *area_space; + dsa_area *dead_items_dsa; bool *will_parallel_vacuum; Size est_indstats_len; Size est_shared_len; - Size est_dead_items_len; + Size dsa_minsize = dsa_minimum_size(); int nindexes_mwm = 0; int parallel_workers = 0; int querylen; @@ -283,9 +288,8 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_estimate_chunk(&pcxt->estimator, est_shared_len); shm_toc_estimate_keys(&pcxt->estimator, 1); - /* Estimate size for dead_items -- PARALLEL_VACUUM_KEY_DEAD_ITEMS */ - est_dead_items_len = vac_max_items_to_alloc_size(max_items); - shm_toc_estimate_chunk(&pcxt->estimator, est_dead_items_len); + /* Estimate size for dead tuple DSA -- PARALLEL_VACUUM_KEY_DSA */ + shm_toc_estimate_chunk(&pcxt->estimator, dsa_minsize); shm_toc_estimate_keys(&pcxt->estimator, 1); /* @@ -351,6 +355,15 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_INDEX_STATS, indstats); pvs->indstats = indstats; + /* Prepare DSA space for dead items */ + area_space = shm_toc_allocate(pcxt->toc, dsa_minsize); + shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DSA, area_space); + dead_items_dsa = dsa_create_in_place(area_space, dsa_minsize, + LWTRANCHE_PARALLEL_VACUUM_DSA, + pcxt->seg); + dead_items = tidstore_create(dead_items_dsa); + pvs->dead_items = dead_items; + /* Prepare shared information */ shared = (PVShared *) shm_toc_allocate(pcxt->toc, est_shared_len); MemSet(shared, 0, est_shared_len); @@ -360,6 +373,7 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, (nindexes_mwm > 0) ? maintenance_work_mem / Min(parallel_workers, nindexes_mwm) : maintenance_work_mem; + shared->dead_items_handle = tidstore_get_handle(dead_items); pg_atomic_init_u32(&(shared->cost_balance), 0); pg_atomic_init_u32(&(shared->active_nworkers), 0); @@ -368,15 +382,6 @@ parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_SHARED, shared); pvs->shared = shared; - /* Prepare the dead_items space */ - dead_items = (VacDeadItems *) shm_toc_allocate(pcxt->toc, - est_dead_items_len); - dead_items->max_items = max_items; - dead_items->num_items = 0; - MemSet(dead_items->items, 0, sizeof(ItemPointerData) * max_items); - shm_toc_insert(pcxt->toc, PARALLEL_VACUUM_KEY_DEAD_ITEMS, dead_items); - pvs->dead_items = dead_items; - /* * Allocate space for each worker's BufferUsage and WalUsage; no need to * initialize @@ -434,6 +439,8 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) istats[i] = NULL; } + tidstore_free(pvs->dead_items); + DestroyParallelContext(pvs->pcxt); ExitParallelMode(); @@ -442,7 +449,7 @@ parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats) } /* Returns the dead items space */ -VacDeadItems * +TIDStore * parallel_vacuum_get_dead_items(ParallelVacuumState *pvs) { return pvs->dead_items; @@ -940,7 +947,9 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) Relation *indrels; PVIndStats *indstats; PVShared *shared; - VacDeadItems *dead_items; + TIDStore *dead_items; + void *area_space; + dsa_area *dead_items_area; BufferUsage *buffer_usage; WalUsage *wal_usage; int nindexes; @@ -984,10 +993,10 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) PARALLEL_VACUUM_KEY_INDEX_STATS, false); - /* Set dead_items space */ - dead_items = (VacDeadItems *) shm_toc_lookup(toc, - PARALLEL_VACUUM_KEY_DEAD_ITEMS, - false); + /* Set dead items */ + area_space = shm_toc_lookup(toc, PARALLEL_VACUUM_KEY_DSA, false); + dead_items_area = dsa_attach_in_place(area_space, seg); + dead_items = tidstore_attach(dead_items_area, shared->dead_items_handle); /* Set cost-based vacuum delay */ VacuumCostActive = (VacuumCostDelay > 0); @@ -1033,6 +1042,8 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber], &wal_usage[ParallelWorkerNumber]); + tidstore_detach(pvs.dead_items); + /* Pop the error context stack */ error_context_stack = errcallback.previous; diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c index 3b06f22af5..a428046d71 100644 --- a/src/backend/lib/radixtree.c +++ b/src/backend/lib/radixtree.c @@ -1731,6 +1731,7 @@ rt_attach(dsa_area *dsa, dsa_pointer dp) tree = (radix_tree *) palloc0(sizeof(radix_tree)); tree->ctl_dp = dp; + tree->dsa = dsa; tree->ctl = (radix_tree_control *) dsa_get_address(dsa, dp); /* XXX: do we need to set a callback on exit to detach dsa? */ @@ -1738,6 +1739,14 @@ rt_attach(dsa_area *dsa, dsa_pointer dp) return tree; } +void +rt_detach(radix_tree *tree) +{ + Assert(RadixTreeIsShared(tree)); + dsa_detach(tree->dsa); + pfree(tree); +} + /* * Free the given radix tree. */ diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 0fc0cf6ebb..f94608f45a 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -183,6 +183,8 @@ static const char *const BuiltinTrancheNames[] = { "PgStatsHash", /* LWTRANCHE_PGSTATS_DATA: */ "PgStatsData", + /* LWTRANCHE_PARALLEL_VACUUM_DSA: */ + "ParallelVacuumDSA", }; StaticAssertDecl(lengthof(BuiltinTrancheNames) == diff --git a/src/include/access/tidstore.h b/src/include/access/tidstore.h new file mode 100644 index 0000000000..40b8021f9b --- /dev/null +++ b/src/include/access/tidstore.h @@ -0,0 +1,55 @@ +/*------------------------------------------------------------------------- + * + * tidstore.h + * TID storage. + * + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/tidstore.h + * + *------------------------------------------------------------------------- + */ +#ifndef TIDSTORE_H +#define TIDSTORE_H + +#include "lib/radixtree.h" +#include "storage/itemptr.h" + +typedef dsa_pointer tidstore_handle; + +typedef struct TIDStore TIDStore; + +typedef struct TIDStoreIter +{ + TIDStore *ts; + + rt_iter *tree_iter; + + bool finished; + + uint64 next_key; + uint64 next_val; + + BlockNumber blkno; + OffsetNumber offsets[MaxOffsetNumber]; /* XXX: usually don't use up */ + int num_offsets; +} TIDStoreIter; + +extern TIDStore *tidstore_create(dsa_area *dsa); +extern TIDStore *tidstore_attach(dsa_area *dsa, dsa_pointer handle); +extern void tidstore_detach(TIDStore *ts); +extern void tidstore_free(TIDStore *ts); +extern void tidstore_reset(TIDStore *ts); +extern void tidstore_add_tids(TIDStore *ts, BlockNumber blkno, OffsetNumber *offsets, + int num_offsets); +extern bool tidstore_lookup_tid(TIDStore *ts, ItemPointer tid); +extern TIDStoreIter * tidstore_begin_iterate(TIDStore *ts); +extern bool tidstore_iterate_next(TIDStoreIter *iter); +extern uint64 tidstore_num_tids(TIDStore *ts); +extern uint64 tidstore_memory_usage(TIDStore *ts); +extern tidstore_handle tidstore_get_handle(TIDStore *ts); + +#endif /* TIDSTORE_H */ + diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 5d816ba7f4..d221528f16 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -17,6 +17,7 @@ #include "access/htup.h" #include "access/genam.h" #include "access/parallel.h" +#include "access/tidstore.h" #include "catalog/pg_class.h" #include "catalog/pg_statistic.h" #include "catalog/pg_type.h" @@ -235,21 +236,6 @@ typedef struct VacuumParams int nworkers; } VacuumParams; -/* - * VacDeadItems stores TIDs whose index tuples are deleted by index vacuuming. - */ -typedef struct VacDeadItems -{ - int max_items; /* # slots allocated in array */ - int num_items; /* current # of entries */ - - /* Sorted array of TIDs to delete from indexes */ - ItemPointerData items[FLEXIBLE_ARRAY_MEMBER]; -} VacDeadItems; - -#define MAXDEADITEMS(avail_mem) \ - (((avail_mem) - offsetof(VacDeadItems, items)) / sizeof(ItemPointerData)) - /* GUC parameters */ extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ extern PGDLLIMPORT int vacuum_freeze_min_age; @@ -306,18 +292,16 @@ extern Relation vacuum_open_relation(Oid relid, RangeVar *relation, LOCKMODE lmode); extern IndexBulkDeleteResult *vac_bulkdel_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat, - VacDeadItems *dead_items); + TIDStore *dead_items); extern IndexBulkDeleteResult *vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat); -extern Size vac_max_items_to_alloc_size(int max_items); /* in commands/vacuumparallel.c */ extern ParallelVacuumState *parallel_vacuum_init(Relation rel, Relation *indrels, int nindexes, int nrequested_workers, - int max_items, int elevel, - BufferAccessStrategy bstrategy); + int elevel, BufferAccessStrategy bstrategy); extern void parallel_vacuum_end(ParallelVacuumState *pvs, IndexBulkDeleteResult **istats); -extern VacDeadItems *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs); +extern TIDStore *parallel_vacuum_get_dead_items(ParallelVacuumState *pvs); extern void parallel_vacuum_bulkdel_all_indexes(ParallelVacuumState *pvs, long num_table_tuples, int num_index_scans); diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index d9d8355c21..e3f90adebd 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -29,6 +29,7 @@ extern rt_iter *rt_begin_iterate(radix_tree *tree); extern dsa_pointer rt_get_dsa_pointer(radix_tree *tree); extern radix_tree *rt_attach(dsa_area *dsa, dsa_pointer dp); +extern void rt_detach(radix_tree *tree); extern bool rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p); extern void rt_end_iterate(rt_iter *iter); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index ca4eca76f4..0999e4fc10 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -193,6 +193,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_PGSTATS_DSA, LWTRANCHE_PGSTATS_HASH, LWTRANCHE_PGSTATS_DATA, + LWTRANCHE_PARALLEL_VACUUM_DSA, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; -- 2.31.1