From 3a7dbf780a81a6ec36905259d456b46c50473827 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 18 Jul 2022 14:35:44 -0700 Subject: [PATCH v9 4/5] Add eager and lazy VM strategies to VACUUM. Acquire an in-memory immutable "snapshot" of the target rel's visibility map at the start of each VACUUM, and use the snapshot to determine when and how VACUUM will skip pages. The data structure we use is a local copy of the visibility map at the start of VACUUM. It spills to disk as required. In practice VACUUM only uses a temp file for relations that are large enough to have more than a single visibility map page. Non-aggressive VACUUMs now make an up-front choice about VM snapshot strategy: they decide whether or not to prioritize early advancement of relfrozenxid (eager strategy) over avoiding work by skipping all-visible pages (lazy strategy). VACUUM decides on its skipping and freezing strategies together, shortly before the first pass over the heap begins, since the concepts are closely related, and work in tandem. Note that the eager VM strategy often has a significant impact on the total number of pages frozen by VACUUM, even when lazy freezing is also used. (In general VACUUM tends to use either lazy or eager strategies across the board, though notable exceptions exist.) Also make the VACUUM command's DISABLE_PAGE_SKIPPING option stop forcing aggressive mode. As a consequence, the option will no longer have any impact on when or how VACUUM waits for a cleanup lock the hard way. The option now makes VACUUM distrust the visibility map, and nothing more. DISABLE_PAGE_SKIPPING now works by making VACUUM opt to use a dedicated "no skipping" VM snapshot strategy. 
This lays the groundwork for completely removing aggressive mode VACUUMs in a later commit; vmsnap strategies supersede the "early aggressive VACUUM" concept previously implemented by vacuum_freeze_table_age, which is now just a compatibility option (its new default of -1 is interpreted as "just use autovacuum_freeze_max_age"). VACUUM chooses which VM skip strategy to use by directly considering how close table age is to autovacuum_freeze_max_age (strictly speaking, to vacuum_freeze_table_age, which defaults to autovacuum_freeze_max_age), in a way that is roughly comparable to our previous approach. But table age is now just one of several factors considered. Also add explicit I/O prefetching of heap pages, which is controlled by maintenance_io_concurrency. We prefetch at the point that the next block in line is requested by VACUUM. Prefetching is under the direct control of the visibility map snapshot code, since VACUUM's vmsnap is now an authoritative guide to which pages VACUUM will scan. VACUUM's final scanned_pages is "locked in" when it decides on skipping strategy (so scanned_pages is finalized before the first heap pass even begins). Prefetching should totally avoid the loss of performance that might otherwise result from removing SKIP_PAGES_THRESHOLD in this commit. SKIP_PAGES_THRESHOLD was intended to force OS readahead and encourage relfrozenxid advancement. For full details, see commit bf136cf6, from around the time the visibility map first went in. Also teach VACUUM to use scanned_pages (not rel_pages) to cap the size of the dead_items array. This approach is strictly better, since there is no question of scanning any pages other than the precise set of pages already locked in by vmsnap by the time dead_items is allocated. 
Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Jeff Davis Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com --- src/include/access/visibilitymap.h | 17 + src/include/commands/vacuum.h | 5 +- src/backend/access/heap/vacuumlazy.c | 450 ++++++++------- src/backend/access/heap/visibilitymap.c | 541 ++++++++++++++++++ src/backend/commands/cluster.c | 3 +- src/backend/commands/vacuum.c | 81 +-- src/backend/utils/misc/guc_tables.c | 8 +- src/backend/utils/misc/postgresql.conf.sample | 9 +- doc/src/sgml/config.sgml | 34 +- doc/src/sgml/ref/vacuum.sgml | 9 +- src/test/regress/expected/reloptions.out | 8 +- src/test/regress/sql/reloptions.sql | 8 +- 12 files changed, 906 insertions(+), 267 deletions(-) diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index 55f67edb6..358b6f0fa 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -26,6 +26,17 @@ #define VM_ALL_FROZEN(r, b, v) \ ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) +/* Snapshot of visibility map at a point in time */ +typedef struct vmsnapshot vmsnapshot; + +/* VACUUM skipping strategy */ +typedef enum vmstrategy +{ + VMSNAP_SKIP_NONE = 0, + VMSNAP_SKIP_ALL_VISIBLE, + VMSNAP_SKIP_ALL_FROZEN +} vmstrategy; + extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags); extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, @@ -35,6 +46,12 @@ extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags); extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); +extern vmsnapshot *visibilitymap_snap_acquire(Relation rel, BlockNumber rel_pages, + BlockNumber *scanned_pages_skipallvis, + BlockNumber *scanned_pages_skipallfrozen); +extern void visibilitymap_snap_strategy(vmsnapshot *vmsnap, 
vmstrategy strat); +extern BlockNumber visibilitymap_snap_next(vmsnapshot *vmsnap, bool *allvisible); +extern void visibilitymap_snap_release(vmsnapshot *vmsnap); extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); extern BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index de28d581a..4dcef3e67 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -187,7 +187,7 @@ typedef struct VacAttrStats #define VACOPT_FULL 0x10 /* FULL (non-concurrent) vacuum */ #define VACOPT_SKIP_LOCKED 0x20 /* skip if cannot get lock */ #define VACOPT_PROCESS_TOAST 0x40 /* process the TOAST table, if any */ -#define VACOPT_DISABLE_PAGE_SKIPPING 0x80 /* don't skip any pages */ +#define VACOPT_DISABLE_PAGE_SKIPPING 0x80 /* don't skip using VM */ /* * Values used by index_cleanup and truncate params. @@ -336,7 +336,8 @@ extern void vac_update_relstats(Relation relation, bool *minmulti_updated, bool in_outer_xact); extern bool vacuum_get_cutoffs(Relation rel, const VacuumParams *params, - struct VacuumCutoffs *cutoffs); + struct VacuumCutoffs *cutoffs, + double *tableagefrac); extern bool vacuum_xid_failsafe_check(const struct VacuumCutoffs *cutoffs); extern void vac_update_datfrozenxid(void); extern void vacuum_delay_point(void); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 307842582..60c1e2cec 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -11,8 +11,8 @@ * We are willing to use at most maintenance_work_mem (or perhaps * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially * allocate an array of TIDs of that size, with an upper limit that depends on - * table size (this limit ensures we don't allocate a huge area uselessly for - * vacuuming small tables). 
If the array threatens to overflow, we must call + * the number of pages we'll scan (this limit ensures we don't allocate a huge + * area for TIDs uselessly). If the array threatens to overflow, we must call * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned). * This frees up the memory space dedicated to storing dead TIDs. * @@ -109,10 +109,18 @@ ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) /* - * Before we consider skipping a page that's marked as clean in - * visibility map, we must've seen at least this many clean pages. + * Thresholds (expressed as a proportion of rel_pages) that influence VACUUM's + * choice of skipping strategy */ -#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) +#define SKIPALLVIS_MIN_PAGES 0.05 /* 5% of rel_pages */ +#define SKIPALLVIS_MAX_PAGES 0.70 + +/* + * tableagefrac-wise cutoffs that control when VACUUM decides on skipping + * using SKIPALLVIS_MIN_PAGES and SKIPALLVIS_MAX_PAGES cutoffs respectively + */ +#define TABLEAGEFRAC_MIDPOINT 0.5 /* half way to antiwraparound AV */ +#define TABLEAGEFRAC_HIGHPOINT 0.9 /* * Size of the prefetch window for lazy vacuum backwards truncation scan. @@ -150,8 +158,6 @@ typedef struct LVRelState /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ bool aggressive; - /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ - bool skipwithvm; /* Eagerly freeze all tuples on pages about to be set all-visible? */ bool eager_freeze_strategy; /* Wraparound failsafe has been triggered? 
*/ @@ -170,7 +176,9 @@ typedef struct LVRelState /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */ TransactionId NewRelfrozenXid; MultiXactId NewRelminMxid; - bool skippedallvis; + /* Immutable snapshot of visibility map (as of time that VACUUM began) */ + vmsnapshot *vmsnap; + vmstrategy vmstrat; /* Error reporting state */ char *relnamespace; @@ -243,11 +251,9 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ static void lazy_scan_heap(LVRelState *vacrel); -static void lazy_scan_strategy(LVRelState *vacrel); -static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, - BlockNumber next_block, - bool *next_unskippable_allvis, - bool *skipping_current_range); +static BlockNumber lazy_scan_strategy(LVRelState *vacrel, + const VacuumParams *params, + double tableagefrac); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); @@ -277,7 +283,8 @@ static bool should_attempt_truncation(LVRelState *vacrel); static void lazy_truncate_heap(LVRelState *vacrel); static BlockNumber count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected); -static void dead_items_alloc(LVRelState *vacrel, int nworkers); +static void dead_items_alloc(LVRelState *vacrel, int nworkers, + BlockNumber scanned_pages); static void dead_items_cleanup(LVRelState *vacrel); static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen); @@ -309,10 +316,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, LVRelState *vacrel; bool verbose, instrument, - skipwithvm, frozenxid_updated, minmulti_updated; + double tableagefrac; BlockNumber orig_rel_pages, + scanned_pages, new_rel_pages, new_rel_allvisible; PGRUsage ru0; @@ -452,43 +460,34 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * future we might want to teach lazy_scan_prune to recompute vistest from * time to time, to increase the 
number of dead tuples it can prune away.) */ - vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs); + vacrel->aggressive = vacuum_get_cutoffs(rel, params, &vacrel->cutoffs, + &tableagefrac); vacrel->rel_pages = orig_rel_pages = RelationGetNumberOfBlocks(rel); vacrel->vistest = GlobalVisTestFor(rel); /* Initialize state used to track oldest extant XID/MXID */ vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; - vacrel->skippedallvis = false; - skipwithvm = true; - if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) - { - /* - * Force aggressive mode, and disable skipping blocks using the - * visibility map (even those set all-frozen) - */ - vacrel->aggressive = true; - skipwithvm = false; - } - - vacrel->skipwithvm = skipwithvm; /* - * Determine freezing strategy used by VACUUM + * Now determine skipping and freezing strategies used by this VACUUM. + * + * This process is driven in part by information from VACUUM's visibility + * map snapshot, which will be acquired in passing. lazy_scan_heap will + * use the same immutable VM snapshot to determine which pages to skip. + * Using an immutable structure (instead of the live visibility map) helps + * VACUUM avoid scanning concurrently modified pages. These pages can + * only have deleted tuples that OldestXmin will consider RECENTLY_DEAD. 
*/ - lazy_scan_strategy(vacrel); + scanned_pages = lazy_scan_strategy(vacrel, params, tableagefrac); if (verbose) - { - if (vacrel->aggressive) - ereport(INFO, - (errmsg("aggressively vacuuming \"%s.%s.%s\"", - get_database_name(MyDatabaseId), - vacrel->relnamespace, vacrel->relname))); - else - ereport(INFO, - (errmsg("vacuuming \"%s.%s.%s\"", - get_database_name(MyDatabaseId), - vacrel->relnamespace, vacrel->relname))); - } + ereport(INFO, + (errmsg("vacuuming \"%s.%s.%s\"", + get_database_name(MyDatabaseId), + vacrel->relnamespace, vacrel->relname), + errdetail("Table has %u pages in total, of which %u pages (%.2f%% of total) will be scanned.", + orig_rel_pages, scanned_pages, + orig_rel_pages == 0 ? 100.0 : + 100.0 * scanned_pages / orig_rel_pages))); /* * Allocate dead_items array memory using dead_items_alloc. This handles @@ -498,13 +497,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * is already dangerously old.) */ lazy_check_wraparound_failsafe(vacrel); - dead_items_alloc(vacrel, params->nworkers); + dead_items_alloc(vacrel, params->nworkers, scanned_pages); /* * Call lazy_scan_heap to perform all required heap pruning, index * vacuuming, and heap vacuuming (plus related processing) */ lazy_scan_heap(vacrel); + Assert(vacrel->scanned_pages == scanned_pages); /* * Free resources managed by dead_items_alloc. This ends parallel mode in @@ -551,12 +551,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff : vacrel->cutoffs.relminmxid, vacrel->NewRelminMxid)); - if (vacrel->skippedallvis) + if (vacrel->vmstrat == VMSNAP_SKIP_ALL_VISIBLE) { /* - * Must keep original relfrozenxid in a non-aggressive VACUUM that - * chose to skip an all-visible page range. The state that tracks new - * values will have missed unfrozen XIDs from the pages we skipped. 
+ * Must keep original relfrozenxid when lazy_scan_strategy call + * decided to skip all-visible pages */ Assert(!vacrel->aggressive); vacrel->NewRelfrozenXid = InvalidTransactionId; @@ -601,6 +600,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->missed_dead_tuples); pgstat_progress_end_command(); + /* Done with rel's visibility map snapshot */ + visibilitymap_snap_release(vacrel->vmsnap); + if (instrument) { TimestampTz endtime = GetCurrentTimestamp(); @@ -628,10 +630,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, initStringInfo(&buf); if (verbose) { - /* - * Aggressiveness already reported earlier, in dedicated - * VACUUM VERBOSE ereport - */ Assert(!params->is_wraparound); msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); } @@ -827,13 +825,12 @@ lazy_scan_heap(LVRelState *vacrel) { BlockNumber rel_pages = vacrel->rel_pages, blkno, - next_unskippable_block, + next_block_to_scan, next_failsafe_block = 0, next_fsm_block_to_vacuum = 0; + bool next_all_visible; VacDeadItems *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; - bool next_unskippable_allvis, - skipping_current_range; const int initprog_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_TOTAL_HEAP_BLKS, @@ -847,42 +844,29 @@ lazy_scan_heap(LVRelState *vacrel) initprog_val[2] = dead_items->max_items; pgstat_progress_update_multi_param(3, initprog_index, initprog_val); - /* Set up an initial range of skippable blocks using the visibility map */ - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, - &next_unskippable_allvis, - &skipping_current_range); + next_block_to_scan = visibilitymap_snap_next(vacrel->vmsnap, + &next_all_visible); for (blkno = 0; blkno < rel_pages; blkno++) { Buffer buf; Page page; - bool all_visible_according_to_vm; + bool all_visible_according_to_vmsnap; LVPagePruneState prunestate; - if (blkno == next_unskippable_block) + if (blkno < next_block_to_scan) { - /* - * Can't skip this page safely. Must scan the page. 
But - * determine the next skippable range after the page first. - */ - all_visible_according_to_vm = next_unskippable_allvis; - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, - blkno + 1, - &next_unskippable_allvis, - &skipping_current_range); - - Assert(next_unskippable_block >= blkno + 1); + Assert(blkno != rel_pages - 1); + continue; } - else - { - /* Last page always scanned (may need to set nonempty_pages) */ - Assert(blkno < rel_pages - 1); - if (skipping_current_range) - continue; - - /* Current range is too small to skip -- just scan the page */ - all_visible_according_to_vm = true; - } + /* + * Determine the next page in line to be scanned according to vmsnap + * before scanning this page + */ + all_visible_according_to_vmsnap = next_all_visible; + next_block_to_scan = visibilitymap_snap_next(vacrel->vmsnap, + &next_all_visible); + Assert(next_block_to_scan > blkno); vacrel->scanned_pages++; @@ -1092,10 +1076,9 @@ lazy_scan_heap(LVRelState *vacrel) } /* - * Handle setting visibility map bit based on information from the VM - * (as of last lazy_scan_skip() call), and from prunestate + * Update visibility map status of this page where required */ - if (!all_visible_according_to_vm && prunestate.all_visible) + if (!all_visible_according_to_vmsnap && prunestate.all_visible) { uint8 flags = VISIBILITYMAP_ALL_VISIBLE; @@ -1123,12 +1106,10 @@ lazy_scan_heap(LVRelState *vacrel) } /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after lazy_scan_skip() was called, so we must recheck - * with buffer lock before concluding that the VM is corrupt. 
+ * The authoritative visibility map bit should never be set if the + * page-level bit is clear */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) + else if (all_visible_according_to_vmsnap && !PageIsAllVisible(page) && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer)) { elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", @@ -1167,7 +1148,7 @@ lazy_scan_heap(LVRelState *vacrel) * mark it as all-frozen. Note that all_frozen is only valid if * all_visible is true, so we must check both prunestate fields. */ - else if (all_visible_according_to_vm && prunestate.all_visible && + else if (all_visible_according_to_vmsnap && prunestate.all_visible && prunestate.all_frozen && !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) { @@ -1260,128 +1241,204 @@ lazy_scan_heap(LVRelState *vacrel) } /* - * lazy_scan_strategy() -- Determine freezing strategy. + * lazy_scan_strategy() -- Determine freezing/skipping strategy. * * Our traditional/lazy freezing strategy is useful when putting off the work * of freezing totally avoids work that turns out to have been unnecessary. * On the other hand we eagerly freeze pages when that strategy spreads out * the burden of freezing over time. + * + * Also determines if the ongoing VACUUM operation should skip all-visible + * pages to save work in the near term, or if we should prefer to advance + * relfrozenxid/relminmxid in the near term instead. + * + * Freezing and skipping strategies are structured as two independent choices, + * but they are not independent in any practical sense (it's just mechanical). + * Eager and lazy behaviors go hand in hand, since the choice of each strategy + * is driven by the same information, and similar considerations about the + * needs of the table. 
Moreover, choosing eager skipping behavior is often + * expected to directly result in freezing many more pages, since VACUUM can + * only _consider_ freezing pages that it actually scans in the first place. + * All-visible pages are only eligible for freezing when not skipped over. + * + * The single most important justification for the eager behaviors is system + * level performance stability. It is often better to freeze all-visible + * pages before we're truly forced to (just to advance relfrozenxid) as a way + * of avoiding big spikes, where VACUUM has to freeze many pages all at once. + * + * Returns final scanned_pages for the VACUUM operation. The exact number of + * pages that lazy_scan_heap scans depends in part on the skipping strategy + * decided here. */ -static void -lazy_scan_strategy(LVRelState *vacrel) +static BlockNumber +lazy_scan_strategy(LVRelState *vacrel, const VacuumParams *params, + double tableagefrac) { - BlockNumber rel_pages = vacrel->rel_pages; + BlockNumber rel_pages = vacrel->rel_pages, + force_eager_skip_threshold, + scanned_pages_skipallvis, + scanned_pages_skipallfrozen; Assert(vacrel->scanned_pages == 0); + /* Acquire a VM snapshot for VACUUM operation */ + vacrel->vmsnap = visibilitymap_snap_acquire(vacrel->rel, rel_pages, + &scanned_pages_skipallvis, + &scanned_pages_skipallfrozen); + vacrel->vmstrat = VMSNAP_SKIP_NONE; + + /* + * The eager freezing strategy is used when a physical table size + * threshold controlled by the freeze_strategy_threshold GUC/reloption is + * crossed. Also freeze eagerly whenever table age is close to requiring + * (or is actually undergoing) an antiwraparound autovacuum. + */ vacrel->eager_freeze_strategy = - rel_pages >= vacrel->cutoffs.freeze_strategy_threshold; -} + (tableagefrac >= TABLEAGEFRAC_HIGHPOINT || + rel_pages >= vacrel->cutoffs.freeze_strategy_threshold); -/* - * lazy_scan_skip() -- set up range of skippable blocks using visibility map. 
- * - * lazy_scan_heap() calls here every time it needs to set up a new range of - * blocks to skip via the visibility map. Caller passes the next block in - * line. We return a next_unskippable_block for this range. When there are - * no skippable blocks we just return caller's next_block. The all-visible - * status of the returned block is set in *next_unskippable_allvis for caller, - * too. Block usually won't be all-visible (since it's unskippable), but it - * can be during aggressive VACUUMs (as well as in certain edge cases). - * - * Sets *skipping_current_range to indicate if caller should skip this range. - * Costs and benefits drive our decision. Very small ranges won't be skipped. - * - * Note: our opinion of which blocks can be skipped can go stale immediately. - * It's okay if caller "misses" a page whose all-visible or all-frozen marking - * was concurrently cleared, though. All that matters is that caller scan all - * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact. - * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with - * older XIDs/MXIDs. The vacrel->skippedallvis flag will be set here when the - * choice to skip such a range is actually made, making everything safe.) - */ -static BlockNumber -lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block, - bool *next_unskippable_allvis, bool *skipping_current_range) -{ - BlockNumber rel_pages = vacrel->rel_pages, - next_unskippable_block = next_block, - nskippable_blocks = 0; - bool skipsallvis = false; - - *next_unskippable_allvis = true; - while (next_unskippable_block < rel_pages) + /* + * Force the use of VMSNAP_SKIP_ALL_FROZEN when rel_pages is now at least + * twice freeze_strategy_threshold. 
+ * + * "Staggering" the freezing and skipping behaviors like this is intended + * to give VACUUM the benefit of the lazy strategies where they are useful + * (when vacuuming smaller tables), while avoiding sharp discontinuities + * in the overhead of freezing when transitioning to eager behaviors. It + * is useful to make a gradual transition for tables that start out small, + * but continue to grow. We can mostly avoid any large once-off freezing + * spikes this way. (Recall that use of the VMSNAP_SKIP_ALL_FROZEN vmsnap + * strategy is often enough to significantly increase the number of pages + * frozen, even when VACUUM also opts to use the lazy freezing strategy.) + * + * force_eager_skip_threshold is useful because it is an _absolute_ cutoff + * that doesn't depend on short-term costs, nor on tableagefrac. VACUUM + * thereby avoids concentrated build-ups of unfrozen pages in any table. + * This is important during bulk loading, where very few transactions will + * leave behind very many heap pages that we should freeze proactively. + * + * Laziness is only valuable when it totally avoids unnecessary freezing, + * which is much less likely to work out (and much more likely to lead to + * disruptive "catch-up" freezing) with a larger table. 
+ */ + force_eager_skip_threshold = vacrel->cutoffs.freeze_strategy_threshold; + if (force_eager_skip_threshold < MaxBlockNumber / 2) + force_eager_skip_threshold *= 2; + if (tableagefrac >= TABLEAGEFRAC_HIGHPOINT || + rel_pages >= force_eager_skip_threshold) { - uint8 mapbits = visibilitymap_get_status(vacrel->rel, - next_unskippable_block, - vmbuffer); - - if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0); - *next_unskippable_allvis = false; - break; - } + vacrel->vmstrat = VMSNAP_SKIP_ALL_FROZEN; + } + else + { + BlockNumber nextra, + nextra_min_threshold, + nextra_max_threshold, + prefer_laziness_threshold; /* - * Caller must scan the last page to determine whether it has tuples - * (caller must have the opportunity to set vacrel->nonempty_pages). - * This rule avoids having lazy_truncate_heap() take access-exclusive - * lock on rel to attempt a truncation that fails anyway, just because - * there are tuples on the last page (it is likely that there will be - * tuples on other nearby pages as well, but those can be skipped). + * Neither tableagefrac nor rel_pages crossed the thresholds that + * automatically force use of the VMSNAP_SKIP_ALL_FROZEN strategy. + * Advancing relfrozenxid/relminmxid eagerly may still make sense, but + * we now need to apply more information to decide what to do. * - * Implement this by always treating the last block as unsafe to skip. + * Determine the number of "extra" scanned_pages incurred by using + * VMSNAP_SKIP_ALL_FROZEN instead of VMSNAP_SKIP_ALL_VISIBLE, which is + * the "extra" cost that our eager VMSNAP_SKIP_ALL_FROZEN strategy + * incurs, if we actually opt to use it. + * + * Also determine guideline "extra" scanned_pages thresholds. These + * represent minimum and maximum sensible thresholds for rel. 
*/ - if (next_unskippable_block == rel_pages - 1) - break; + nextra = scanned_pages_skipallfrozen - scanned_pages_skipallvis; + Assert(rel_pages >= nextra); + nextra_min_threshold = (double) rel_pages * SKIPALLVIS_MIN_PAGES; + nextra_max_threshold = (double) rel_pages * SKIPALLVIS_MAX_PAGES; + Assert(nextra_max_threshold >= nextra_min_threshold); - /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */ - if (!vacrel->skipwithvm) - break; - - /* - * Aggressive VACUUM caller can't skip pages just because they are - * all-visible. They may still skip all-frozen pages, which can't - * contain XIDs < OldestXmin (XIDs that aren't already frozen by now). - */ - if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0) + if (tableagefrac < TABLEAGEFRAC_MIDPOINT) { - if (vacrel->aggressive) - break; + /* + * The table's age is still below table age mid point, so table + * age is still of only minimal concern. We're still willing to + * act eagerly when it's _very_ cheap to do so. Specifically, + * when VMSNAP_SKIP_ALL_FROZEN requires VACUUM to scan a number of + * extra pages not exceeding 5% of rel_pages. + */ + prefer_laziness_threshold = nextra_min_threshold; + } + else + { + double tableagefrac_high_delta, + min_scale_up; /* - * All-visible block is safe to skip in non-aggressive case. But - * remember that the final range contains such a block for later. + * Our tableagefrac is some point between TABLEAGEFRAC_MIDPOINT + * and TABLEAGEFRAC_HIGHPOINT. This means that table age is + * starting to become a concern, but not to the extent that we're + * forced to use VMSNAP_SKIP_ALL_FROZEN strategy (not yet). We'll + * need to weigh both costs and benefits to decide on a strategy. + * + * If tableagefrac is only barely over the midway point, then + * we'll choose an "extra blocks" threshold of ~5% of rel_pages. + * The opposite extreme occurs when tableagefrac is very near to + * the high point. 
That will make our "extra blocks" threshold + * very aggressive: we'll go with VMSNAP_SKIP_ALL_FROZEN when + * doing so requires we scan a number of extra blocks as high as + * ~70% of rel_pages. Our final "extra blocks" threshold is most + * likely to fall between the two extremes (when we end up here). + * + * Note that the "extra blocks" thresholds we'll use increases at + * an accelerating rate as tableagefrac itself increases (assuming + * a fixed rel_pages, though if rel_pages actually grows then it's + * probably even more likely that VMSNAP_SKIP_ALL_FROZEN will get + * used before long). + * + * Note also that it is unlikely that tables that require regular + * vacuuming will ever have a VACUUM whose tableagefrac actually + * reaches TABLEAGEFRAC_HIGHPOINT, barring cases where table age + * based settings like autovacuum_freeze_max_age are set to very + * low values (which includes VACUUM FREEZE). */ - skipsallvis = true; + Assert(tableagefrac < TABLEAGEFRAC_HIGHPOINT); + tableagefrac_high_delta = TABLEAGEFRAC_HIGHPOINT - tableagefrac; + min_scale_up = 1.0 - (tableagefrac_high_delta / + (TABLEAGEFRAC_HIGHPOINT - TABLEAGEFRAC_MIDPOINT)); + + prefer_laziness_threshold = + (nextra_min_threshold * (1.0 - min_scale_up)) + + (nextra_max_threshold * min_scale_up); } - vacuum_delay_point(); - next_unskippable_block++; - nskippable_blocks++; + prefer_laziness_threshold = Max(32, prefer_laziness_threshold); + if (nextra >= prefer_laziness_threshold) + vacrel->vmstrat = VMSNAP_SKIP_ALL_VISIBLE; + else + vacrel->vmstrat = VMSNAP_SKIP_ALL_FROZEN; } /* - * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive - * pages. Since we're reading sequentially, the OS should be doing - * readahead for us, so there's no gain in skipping a page now and then. - * Skipping such a range might even discourage sequential detection. - * - * This test also enables more frequent relfrozenxid advancement during - * non-aggressive VACUUMs. 
If the range has any all-visible pages then - * skipping makes updating relfrozenxid unsafe, which is a real downside. + * Override choice of skipping strategy (force vmsnap to scan every page + * in the range of rel_pages) in DISABLE_PAGE_SKIPPING case. Also + * defensively force all-frozen in aggressive VACUUMs. */ - if (nskippable_blocks < SKIP_PAGES_THRESHOLD) - *skipping_current_range = false; - else - { - *skipping_current_range = true; - if (skipsallvis) - vacrel->skippedallvis = true; - } + Assert(vacrel->vmstrat != VMSNAP_SKIP_NONE); + if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) + vacrel->vmstrat = VMSNAP_SKIP_NONE; + else if (vacrel->aggressive) + vacrel->vmstrat = VMSNAP_SKIP_ALL_FROZEN; - return next_unskippable_block; + /* Inform vmsnap infrastructure of our chosen strategy */ + visibilitymap_snap_strategy(vacrel->vmsnap, vacrel->vmstrat); + + /* Return appropriate scanned_pages for final strategy chosen */ + if (vacrel->vmstrat == VMSNAP_SKIP_ALL_VISIBLE) + return scanned_pages_skipallvis; + if (vacrel->vmstrat == VMSNAP_SKIP_ALL_FROZEN) + return scanned_pages_skipallfrozen; + + /* DISABLE_PAGE_SKIPPING/VMSNAP_SKIP_NONE case */ + return rel_pages; } /* @@ -2821,6 +2878,14 @@ lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, * Also don't attempt it if we are doing early pruning/vacuuming, because a * scan which cannot find a truncated heap page cannot determine that the * snapshot is too old to read that page. + * + * Note that we effectively rely on visibilitymap_snap_next() having forced + * VACUUM to scan the final page (rel_pages - 1) in all cases. Without that, + * we'd tend to needlessly acquire an AccessExclusiveLock just to attempt rel + * truncation that is bound to fail. VACUUM cannot set vacrel->nonempty_pages + * in pages that it skips using the VM, so we must avoid interpreting skipped + * pages as empty pages when it makes little sense. 
Observing that the final + * page has tuples is a simple way of avoiding pathological locking behavior. */ static bool should_attempt_truncation(LVRelState *vacrel) @@ -3111,14 +3176,13 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) /* * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). + * store, given the expected scanned_pages for this VACUUM operation, + * and given current maintenance_work_mem/autovacuum_work_mem setting. * * See the comments at the head of this file for rationale. */ static int -dead_items_max_items(LVRelState *vacrel) +dead_items_max_items(LVRelState *vacrel, BlockNumber scanned_pages) { int64 max_items; int vac_work_mem = IsAutoVacuumWorkerProcess() && @@ -3127,15 +3191,13 @@ dead_items_max_items(LVRelState *vacrel) if (vacrel->nindexes > 0) { - BlockNumber rel_pages = vacrel->rel_pages; - max_items = MAXDEADITEMS(vac_work_mem * 1024L); max_items = Min(max_items, INT_MAX); max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; + if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > scanned_pages) + max_items = scanned_pages * MaxHeapTuplesPerPage; /* stay sane if small maintenance_work_mem */ max_items = Max(max_items, MaxHeapTuplesPerPage); @@ -3157,12 +3219,12 @@ dead_items_max_items(LVRelState *vacrel) * DSM when required. 
*/ static void -dead_items_alloc(LVRelState *vacrel, int nworkers) +dead_items_alloc(LVRelState *vacrel, int nworkers, BlockNumber scanned_pages) { VacDeadItems *dead_items; int max_items; - max_items = dead_items_max_items(vacrel); + max_items = dead_items_max_items(vacrel, scanned_pages); Assert(max_items >= MaxHeapTuplesPerPage); /* diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4ed70275e..27045032a 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -16,6 +16,10 @@ * visibilitymap_pin_ok - check whether correct map page is already pinned * visibilitymap_set - set a bit in a previously pinned page * visibilitymap_get_status - get status of bits + * visibilitymap_snap_acquire - acquire snapshot of visibility map + * visibilitymap_snap_strategy - set VACUUM's skipping strategy + * visibilitymap_snap_next - get next block to scan from vmsnap + * visibilitymap_snap_release - release previously acquired snapshot * visibilitymap_count - count number of bits set in visibility map * visibilitymap_prepare_truncate - * prepare for truncation of the visibility map @@ -52,6 +56,10 @@ * * VACUUM will normally skip pages for which the visibility map bit is set; * such pages can't contain any dead tuples and therefore don't need vacuuming. + * VACUUM uses a snapshot of the visibility map to avoid scanning pages whose + * visibility map bit gets concurrently unset. This also provides us with a + * convenient way of performing I/O prefetching on behalf of VACUUM, since the + * pages that VACUUM's first heap pass will scan are fully predetermined. 
* * LOCKING * @@ -92,10 +100,12 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "storage/buffile.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/smgr.h" #include "utils/inval.h" +#include "utils/spccache.h" /*#define TRACE_VISIBILITYMAP */ @@ -124,9 +134,87 @@ #define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each * bit pair */ +/* + * Prefetching of heap pages takes place as VACUUM requests the next block in + * line from its visibility map snapshot + * + * XXX MIN_PREFETCH_SIZE of 32 is a little on the high side, but matches + * hard-coded constant used by vacuumlazy.c when prefetching for rel + * truncation. Might be better to increase the maintenance_io_concurrency + * default, or to do nothing like this at all. + */ +#define STAGED_BUFSIZE (MAX_IO_CONCURRENCY * 2) +#define MIN_PREFETCH_SIZE ((BlockNumber) 32) + +typedef struct vmsnapblock +{ + BlockNumber scanned_block; + bool all_visible; +} vmsnapblock; + +/* + * Snapshot of visibility map at the start of a VACUUM operation + */ +struct vmsnapshot +{ + /* Target heap rel */ + Relation rel; + /* Skipping strategy used by VACUUM operation */ + vmstrategy strat; + /* Per-strategy final scanned_pages */ + BlockNumber rel_pages; + BlockNumber scanned_pages_skipallvis; + BlockNumber scanned_pages_skipallfrozen; + + /* + * Materialized visibility map state. + * + * VM snapshots spill to a temp file when required. 
+ */ + BlockNumber nvmpages; + BufFile *file; + + /* + * Prefetch distance, used to perform I/O prefetching of heap pages + */ + int prefetch_distance; + + /* Current VM page cached */ + BlockNumber curvmpage; + char *rawmap; + PGAlignedBlock vmpage; + + /* Staging area for blocks returned to VACUUM */ + vmsnapblock staged[STAGED_BUFSIZE]; + int current_nblocks_staged; + + /* + * Next block from range of rel_pages to consider placing in staged block + * array (it will be placed there if it's going to be scanned by VACUUM) + */ + BlockNumber next_block; + + /* + * Number of blocks that we still need to return, and number of blocks + * that we still need to prefetch + */ + BlockNumber scanned_pages_to_return; + BlockNumber scanned_pages_to_prefetch; + + /* offset of next block in line to return (from staged) */ + int next_return_idx; + /* offset of next block in line to prefetch (from staged) */ + int next_prefetch_idx; + /* offset of first garbage/invalid element (from staged) */ + int first_invalid_idx; +}; + + /* prototypes for internal routines */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); static void vm_extend(Relation rel, BlockNumber vm_nblocks); +static void vm_snap_stage_blocks(vmsnapshot *vmsnap); +static uint8 vm_snap_get_status(vmsnapshot *vmsnap, BlockNumber heapBlk); /* @@ -373,6 +461,350 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) return result; } +/* + * visibilitymap_snap_acquire - get read-only snapshot of visibility map + * + * Initializes VACUUM caller's snapshot, allocating memory in current context. + * Used by VACUUM to determine which pages it must scan up front. + * + * Set scanned_pages_skipallvis and scanned_pages_skipallfrozen to help VACUUM + * decide on its skipping strategy. These are VACUUM's scanned_pages when it + * opts to skip all eligible pages and scanned_pages when it opts to just skip + * all-frozen pages, respectively. 
+ * + * Caller finalizes skipping strategy by calling visibilitymap_snap_strategy. + * This determines the kind of blocks visibilitymap_snap_next should indicate + * need to be scanned by VACUUM. + */ +vmsnapshot * +visibilitymap_snap_acquire(Relation rel, BlockNumber rel_pages, + BlockNumber *scanned_pages_skipallvis, + BlockNumber *scanned_pages_skipallfrozen) +{ + BlockNumber nvmpages = 0, + mapBlockLast = 0, + all_visible = 0, + all_frozen = 0; + uint8 mapbits_last_page = 0; + vmsnapshot *vmsnap; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_acquire %s %u", + RelationGetRelationName(rel), rel_pages); +#endif + + /* + * Allocate space for VM pages up to and including those required to have + * bits for the would-be heap block that is just beyond rel_pages + */ + if (rel_pages > 0) + { + mapBlockLast = HEAPBLK_TO_MAPBLOCK(rel_pages - 1); + nvmpages = mapBlockLast + 1; + } + + /* Allocate and initialize VM snapshot state */ + vmsnap = palloc0(sizeof(vmsnapshot)); + vmsnap->rel = rel; + vmsnap->strat = VMSNAP_SKIP_NONE; /* for now */ + vmsnap->rel_pages = rel_pages; /* scanned_pages for VMSNAP_SKIP_NONE */ + vmsnap->scanned_pages_skipallvis = 0; + vmsnap->scanned_pages_skipallfrozen = 0; + + /* + * vmsnap temp file state. + * + * Only relations large enough to need more than one visibility map page + * use a temp file (cannot wholly rely on vmsnap's single page cache). 
+ */ + vmsnap->nvmpages = nvmpages; + vmsnap->file = NULL; + if (nvmpages > 1) + vmsnap->file = BufFileCreateTemp(false); + vmsnap->prefetch_distance = 0; +#ifdef USE_PREFETCH + vmsnap->prefetch_distance = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); +#endif + vmsnap->prefetch_distance = Max(vmsnap->prefetch_distance, MIN_PREFETCH_SIZE); + + /* cache of VM pages read from temp file */ + vmsnap->curvmpage = 0; + vmsnap->rawmap = NULL; + + /* staged blocks array state */ + vmsnap->current_nblocks_staged = 0; + vmsnap->next_block = 0; + vmsnap->scanned_pages_to_return = 0; + vmsnap->scanned_pages_to_prefetch = 0; + /* Offsets into staged blocks array */ + vmsnap->next_return_idx = 0; + vmsnap->next_prefetch_idx = 0; + vmsnap->first_invalid_idx = 0; + + for (BlockNumber mapBlock = 0; mapBlock <= mapBlockLast; mapBlock++) + { + Buffer mapBuffer; + char *map; + uint64 *umap; + + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* + * Not all VM pages available. Remember that, so that we'll treat + * relevant heap pages as not all-visible/all-frozen when asked. + */ + vmsnap->nvmpages = mapBlock; + break; + } + + /* Cache page locally */ + LockBuffer(mapBuffer, BUFFER_LOCK_SHARE); + memcpy(vmsnap->vmpage.data, BufferGetPage(mapBuffer), BLCKSZ); + UnlockReleaseBuffer(mapBuffer); + + /* Finish off this VM page using snapshot's vmpage cache */ + vmsnap->curvmpage = mapBlock; + vmsnap->rawmap = map = PageGetContents(vmsnap->vmpage.data); + umap = (uint64 *) map; + + if (mapBlock == mapBlockLast) + { + uint32 mapByte; + uint8 mapOffset; + + /* + * The last VM page requires some extra steps. + * + * First get the status of the last heap page (page in the range + * of rel_pages) in passing. 
+ */ + Assert(mapBlock == HEAPBLK_TO_MAPBLOCK(rel_pages - 1)); + mapByte = HEAPBLK_TO_MAPBYTE(rel_pages - 1); + mapOffset = HEAPBLK_TO_OFFSET(rel_pages - 1); + mapbits_last_page = ((map[mapByte] >> mapOffset) & + VISIBILITYMAP_VALID_BITS); + + /* + * Also defensively "truncate" our local copy of the last page in + * order to reliably exclude heap pages beyond the range of + * rel_pages. This is sheer paranoia. + */ + mapByte = HEAPBLK_TO_MAPBYTE(rel_pages); + mapOffset = HEAPBLK_TO_OFFSET(rel_pages); + if (mapByte != 0 || mapOffset != 0) + { + MemSet(&map[mapByte + 1], 0, MAPSIZE - (mapByte + 1)); + map[mapByte] &= (1 << mapOffset) - 1; + } + } + + /* Maintain count of all-frozen and all-visible pages */ + for (int i = 0; i < MAPSIZE / sizeof(uint64); i++) + { + all_visible += pg_popcount64(umap[i] & VISIBLE_MASK64); + all_frozen += pg_popcount64(umap[i] & FROZEN_MASK64); + } + + /* Finally, write out vmpage cache VM page to vmsnap's temp file */ + if (vmsnap->file) + BufFileWrite(vmsnap->file, vmsnap->vmpage.data, BLCKSZ); + } + + /* + * Done copying all VM pages from authoritative VM into a VM snapshot. + * + * Figure out the final scanned_pages for the two skipping policies that + * we might use: skipallvis (skip both all-frozen and all-visible) and + * skipallfrozen (just skip all-frozen). + */ + Assert(all_frozen <= all_visible && all_visible <= rel_pages); + *scanned_pages_skipallvis = rel_pages - all_visible; + *scanned_pages_skipallfrozen = rel_pages - all_frozen; + + /* + * When the last page is skippable in principle, it still won't be treated + * as skippable by visibilitymap_snap_next, which recognizes the last page + * as a special case. Compensate by incrementing each skipping strategy's + * scanned_pages as needed to avoid counting the last page as skippable. 
+ */ + if (mapbits_last_page & VISIBILITYMAP_ALL_VISIBLE) + (*scanned_pages_skipallvis)++; + if (mapbits_last_page & VISIBILITYMAP_ALL_FROZEN) + (*scanned_pages_skipallfrozen)++; + + vmsnap->scanned_pages_skipallvis = *scanned_pages_skipallvis; + vmsnap->scanned_pages_skipallfrozen = *scanned_pages_skipallfrozen; + + return vmsnap; +} + +/* + * visibilitymap_snap_strategy -- determine VACUUM's skipping strategy. + * + * VACUUM chooses a vmsnap strategy according to priorities around advancing + * relfrozenxid. See visibilitymap_snap_acquire. + */ +void +visibilitymap_snap_strategy(vmsnapshot *vmsnap, vmstrategy strat) +{ + int nprefetch; + + /* Remember final skipping strategy */ + vmsnap->strat = strat; + + if (vmsnap->strat == VMSNAP_SKIP_ALL_VISIBLE) + vmsnap->scanned_pages_to_return = vmsnap->scanned_pages_skipallvis; + else if (vmsnap->strat == VMSNAP_SKIP_ALL_FROZEN) + vmsnap->scanned_pages_to_return = vmsnap->scanned_pages_skipallfrozen; + else + vmsnap->scanned_pages_to_return = vmsnap->rel_pages; + + vmsnap->scanned_pages_to_prefetch = vmsnap->scanned_pages_to_return; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_strategy %s %d %u", + RelationGetRelationName(vmsnap->rel), (int) strat, + vmsnap->scanned_pages_to_return); +#endif + + /* + * Stage blocks (may have to read from temp file). + * + * We rely on the assumption that we'll always have a large enough staged + * blocks array to accommodate any possible prefetch distance. + */ + vm_snap_stage_blocks(vmsnap); + + nprefetch = Min(vmsnap->current_nblocks_staged, vmsnap->prefetch_distance); +#ifdef USE_PREFETCH + for (int i = 0; i < nprefetch; i++) + { + BlockNumber block = vmsnap->staged[i].scanned_block; + + PrefetchBuffer(vmsnap->rel, MAIN_FORKNUM, block); + } +#endif + + vmsnap->scanned_pages_to_prefetch -= nprefetch; + vmsnap->next_prefetch_idx += nprefetch; +} + +/* + * visibilitymap_snap_next -- get next block to scan from vmsnap. 
+ * + * Returns next block in line for VACUUM to scan according to vmsnap. Caller + * skips any and all blocks preceding returned block. + * + * The all-visible status of returned block is set in *allvisible. Block + * usually won't be set all-visible (else VACUUM wouldn't need to scan it), + * but it can be in certain corner cases. This includes the VMSNAP_SKIP_NONE + * case, as well as a special case that VACUUM expects us to handle: the final + * block (rel_pages - 1) is always returned here (regardless of our strategy). + * + * VACUUM always scans the last page to determine whether it has tuples. This + * is useful as a way of avoiding certain pathological cases with heap rel + * truncation. + */ +BlockNumber +visibilitymap_snap_next(vmsnapshot *vmsnap, bool *allvisible) +{ + BlockNumber next_block_to_scan; + vmsnapblock block; + + *allvisible = true; + if (vmsnap->scanned_pages_to_return == 0) + return InvalidBlockNumber; + + /* Prepare to return this block */ + block = vmsnap->staged[vmsnap->next_return_idx++]; + *allvisible = block.all_visible; + next_block_to_scan = block.scanned_block; + vmsnap->current_nblocks_staged--; + vmsnap->scanned_pages_to_return--; + + /* + * Did the staged blocks array just run out of blocks to return to caller, + * or do we need to stage more blocks for I/O prefetching purposes? + */ + Assert(vmsnap->next_prefetch_idx <= vmsnap->first_invalid_idx); + if ((vmsnap->current_nblocks_staged == 0 && + vmsnap->scanned_pages_to_return > 0) || + (vmsnap->next_prefetch_idx == vmsnap->first_invalid_idx && + vmsnap->scanned_pages_to_prefetch > 0)) + { + if (vmsnap->current_nblocks_staged > 0) + { + /* + * We've run out of prefetchable blocks, but still have some + * non-returned blocks. Shift existing blocks to the start of the + * array. The newly staged blocks go after these ones.
+ */ + memmove(&vmsnap->staged[0], + &vmsnap->staged[vmsnap->next_return_idx], + sizeof(vmsnapblock) * vmsnap->current_nblocks_staged); + } + + /* + * Reset offsets in staged blocks array, while accounting for likely + * presence of preexisting blocks that have already been prefetched + * but have yet to be returned to VACUUM caller + */ + vmsnap->next_prefetch_idx -= vmsnap->next_return_idx; + vmsnap->first_invalid_idx -= vmsnap->next_return_idx; + vmsnap->next_return_idx = 0; + + /* Stage more blocks (may have to read from temp file) */ + vm_snap_stage_blocks(vmsnap); + } + + /* + * By here we're guaranteed to have at least one prefetchable block in the + * staged blocks array (unless we've already prefetched all blocks that + * will ever be returned to VACUUM caller) + */ + if (vmsnap->next_prefetch_idx < vmsnap->first_invalid_idx) + { +#ifdef USE_PREFETCH + /* Still have remaining blocks to prefetch, so prefetch next one */ + vmsnapblock prefetch = vmsnap->staged[vmsnap->next_prefetch_idx++]; + + PrefetchBuffer(vmsnap->rel, MAIN_FORKNUM, prefetch.scanned_block); +#else + vmsnap->next_prefetch_idx++; +#endif + Assert(vmsnap->current_nblocks_staged > 1); + Assert(vmsnap->scanned_pages_to_prefetch > 0); + vmsnap->scanned_pages_to_prefetch--; + } + else + { + Assert(vmsnap->scanned_pages_to_prefetch == 0); + } + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_next %s %u", + RelationGetRelationName(vmsnap->rel), next_block_to_scan); +#endif + + return next_block_to_scan; +} + +/* + * visibilitymap_snap_release - release previously acquired snapshot + * + * Frees resources allocated in visibilitymap_snap_acquire for VACUUM. 
+ */ +void +visibilitymap_snap_release(vmsnapshot *vmsnap) +{ + Assert(vmsnap->scanned_pages_to_return == 0); + if (vmsnap->file) + BufFileClose(vmsnap->file); + pfree(vmsnap); +} + /* * visibilitymap_count - count number of bits set in visibility map * @@ -677,3 +1109,112 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) UnlockRelationForExtension(rel, ExclusiveLock); } + +/* + * Stage some heap blocks from vmsnap to return to VACUUM caller. + * + * Called when we completely run out of staged blocks to return to VACUUM, or + * when vmsnap still has some pending staged blocks, but too few to be able to + * prefetch incrementally as the remaining blocks are returned to VACUUM. + */ +static void +vm_snap_stage_blocks(vmsnapshot *vmsnap) +{ + Assert(vmsnap->current_nblocks_staged < STAGED_BUFSIZE); + Assert(vmsnap->first_invalid_idx < STAGED_BUFSIZE); + Assert(vmsnap->next_return_idx <= vmsnap->first_invalid_idx); + Assert(vmsnap->next_prefetch_idx <= vmsnap->first_invalid_idx); + + while (vmsnap->next_block < vmsnap->rel_pages && + vmsnap->current_nblocks_staged < STAGED_BUFSIZE) + { + bool all_visible = true; + vmsnapblock stage; + + for (;;) + { + uint8 mapbits = vm_snap_get_status(vmsnap, + vmsnap->next_block); + + if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0) + { + Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0); + all_visible = false; + break; + } + + /* + * Stop skipping blocks just before final page, which must always + * be scanned by VACUUM + */ + if (vmsnap->next_block == vmsnap->rel_pages - 1) + break; + + /* VMSNAP_SKIP_NONE forcing VACUUM to scan every page? */ + if (vmsnap->strat == VMSNAP_SKIP_NONE) + break; + + /* + * Check if it would be unsafe to skip page because it's just + * all-visible, and we're using the VMSNAP_SKIP_ALL_FROZEN strategy.
+ */ + if (vmsnap->strat == VMSNAP_SKIP_ALL_FROZEN && + (mapbits & VISIBILITYMAP_ALL_FROZEN) == 0) + break; + + /* VACUUM will skip this page -- so don't stage it for later */ + vmsnap->next_block++; + } + + /* VACUUM will scan this block, so stage it for later */ + stage.scanned_block = vmsnap->next_block++; + stage.all_visible = all_visible; + vmsnap->staged[vmsnap->first_invalid_idx++] = stage; + vmsnap->current_nblocks_staged++; + } +} + +/* + * Get status of bits from vm snapshot + */ +static uint8 +vm_snap_get_status(vmsnapshot *vmsnap, BlockNumber heapBlk) +{ + BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "vm_snap_get_status %u", heapBlk); +#endif + + /* + * If we didn't see the VM page when the snapshot was first acquired we + * defensively assume heapBlk not all-visible or all-frozen + */ + Assert(heapBlk <= vmsnap->rel_pages); + if (mapBlock >= vmsnap->nvmpages) + return 0; + + /* Read from temp file when required */ + if (mapBlock != vmsnap->curvmpage) + { + size_t nread; + + if (BufFileSeekBlock(vmsnap->file, mapBlock) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to block %u of vmsnap temporary file", + mapBlock))); + nread = BufFileRead(vmsnap->file, vmsnap->vmpage.data, BLCKSZ); + if (nread != BLCKSZ) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u of vmsnap temporary file: read only %zu of %zu bytes", + mapBlock, nread, (size_t) BLCKSZ))); + vmsnap->curvmpage = mapBlock; + vmsnap->rawmap = PageGetContents(vmsnap->vmpage.data); + } + + return ((vmsnap->rawmap[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS); +} diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index b0e310604..5b20d5618 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -825,6 +825,7 @@ copy_table_data(Oid 
OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, TupleDesc newTupDesc PG_USED_FOR_ASSERTS_ONLY; VacuumParams params; struct VacuumCutoffs cutoffs; + double tableagefrac; bool use_sort; double num_tuples = 0, tups_vacuumed = 0, @@ -913,7 +914,7 @@ copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose, * not to be aggressive about this. */ memset(&params, 0, sizeof(VacuumParams)); - vacuum_get_cutoffs(OldHeap, &params, &cutoffs); + vacuum_get_cutoffs(OldHeap, &params, &cutoffs, &tableagefrac); /* * FreezeXid will become the table's new relfrozenxid, and that mustn't go diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 420b85be6..e2f586687 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -920,7 +920,16 @@ get_all_vacuum_rels(int options) * * The target relation and VACUUM parameters are our inputs. * - * Output parameters are the cutoffs that VACUUM caller should use. + * Output parameters are the cutoffs that VACUUM caller should use, and + * tableagefrac, which indicates how close rel is to requiring that VACUUM + * advance relfrozenxid and/or relminmxid. + * + * The tableagefrac value 1.0 represents the point that autovacuum.c scheduling + * (and VACUUM itself) considers relfrozenxid advancement strictly necessary. + * Lower values provide useful context, and influence whether VACUUM will opt + * to advance relfrozenxid before the point that it is strictly necessary. + * VACUUM can (and often does) opt to advance relfrozenxid proactively. It is + * especially likely with tables where the _added_ costs happen to be low. * * Return value indicates if vacuumlazy.c caller should make its VACUUM * operation aggressive.
An aggressive VACUUM must advance relfrozenxid up to @@ -929,7 +938,7 @@ get_all_vacuum_rels(int options) */ bool vacuum_get_cutoffs(Relation rel, const VacuumParams *params, - struct VacuumCutoffs *cutoffs) + struct VacuumCutoffs *cutoffs, double *tableagefrac) { int freeze_min_age, multixact_freeze_min_age, @@ -938,11 +947,11 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, effective_multixact_freeze_max_age, freeze_strategy_threshold; TransactionId nextXID, - safeOldestXmin, - aggressiveXIDCutoff; + safeOldestXmin; MultiXactId nextMXID, - safeOldestMxact, - aggressiveMXIDCutoff; + safeOldestMxact; + double XIDFrac, + MXIDFrac; /* Use mutable copies of freeze age parameters */ freeze_min_age = params->freeze_min_age; @@ -1074,48 +1083,48 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params, cutoffs->freeze_strategy_threshold = freeze_strategy_threshold; /* - * Finally, figure out if caller needs to do an aggressive VACUUM or not. - * * Determine the table freeze age to use: as specified by the caller, or - * the value of the vacuum_freeze_table_age GUC, but in any case not more - * than autovacuum_freeze_max_age * 0.95, so that if you have e.g nightly - * VACUUM schedule, the nightly VACUUM gets a chance to freeze XIDs before - * anti-wraparound autovacuum is launched. + * the value of the vacuum_freeze_table_age GUC. The GUC's default value + * of -1 is interpreted as "just use autovacuum_freeze_max_age value". + * Also clamp using autovacuum_freeze_max_age. 
*/ if (freeze_table_age < 0) freeze_table_age = vacuum_freeze_table_age; - freeze_table_age = Min(freeze_table_age, autovacuum_freeze_max_age * 0.95); - Assert(freeze_table_age >= 0); - aggressiveXIDCutoff = nextXID - freeze_table_age; - if (!TransactionIdIsNormal(aggressiveXIDCutoff)) - aggressiveXIDCutoff = FirstNormalTransactionId; - if (TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid, - aggressiveXIDCutoff)) - return true; + if (freeze_table_age < 0 || freeze_table_age > autovacuum_freeze_max_age) + freeze_table_age = autovacuum_freeze_max_age; /* * Similar to the above, determine the table freeze age to use for * multixacts: as specified by the caller, or the value of the - * vacuum_multixact_freeze_table_age GUC, but in any case not more than - * effective_multixact_freeze_max_age * 0.95, so that if you have e.g. - * nightly VACUUM schedule, the nightly VACUUM gets a chance to freeze - * multixacts before anti-wraparound autovacuum is launched. + * vacuum_multixact_freeze_table_age GUC. The GUC's default value of -1 + * is interpreted as "just use effective_multixact_freeze_max_age value". + * Also clamp using effective_multixact_freeze_max_age. */ if (multixact_freeze_table_age < 0) multixact_freeze_table_age = vacuum_multixact_freeze_table_age; - multixact_freeze_table_age = - Min(multixact_freeze_table_age, - effective_multixact_freeze_max_age * 0.95); - Assert(multixact_freeze_table_age >= 0); - aggressiveMXIDCutoff = nextMXID - multixact_freeze_table_age; - if (aggressiveMXIDCutoff < FirstMultiXactId) - aggressiveMXIDCutoff = FirstMultiXactId; - if (MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid, - aggressiveMXIDCutoff)) - return true; + if (multixact_freeze_table_age < 0 || + multixact_freeze_table_age > effective_multixact_freeze_max_age) + multixact_freeze_table_age = effective_multixact_freeze_max_age; - /* Non-aggressive VACUUM */ - return false; + /* + * Finally, set tableagefrac for VACUUM. 
This can come from either XID or + * MXID table age (whichever is greater currently). + */ + XIDFrac = (double) (nextXID - cutoffs->relfrozenxid) / + ((double) freeze_table_age + 0.5); + MXIDFrac = (double) (nextMXID - cutoffs->relminmxid) / + ((double) multixact_freeze_table_age + 0.5); + *tableagefrac = Max(XIDFrac, MXIDFrac); + + /* + * Make sure that antiwraparound autovacuums reliably advance relfrozenxid + * to the satisfaction of autovacuum.c, even when the reloption version of + * autovacuum_freeze_max_age happens to be in use + */ + if (params->is_wraparound) + *tableagefrac = 1.0; + + return (*tableagefrac >= 1.0); } /* diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 549a2e969..554e2bd0c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2476,10 +2476,10 @@ struct config_int ConfigureNamesInt[] = { {"vacuum_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Age at which VACUUM should scan whole table to freeze tuples."), - NULL + gettext_noop("-1 to use autovacuum_freeze_max_age value.") }, &vacuum_freeze_table_age, - 150000000, 0, 2000000000, + -1, -1, 2000000000, NULL, NULL, NULL }, @@ -2496,10 +2496,10 @@ struct config_int ConfigureNamesInt[] = { {"vacuum_multixact_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Multixact age at which VACUUM should scan whole table to freeze tuples."), - NULL + gettext_noop("-1 to use autovacuum_multixact_freeze_max_age value.") }, &vacuum_multixact_freeze_table_age, - 150000000, 0, 2000000000, + -1, -1, 2000000000, NULL, NULL, NULL }, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 4763cb6bb..bb50a5486 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -658,6 +658,13 @@ # autovacuum, -1 means use # vacuum_cost_limit +# - AUTOVACUUM compatibility options (legacy) - +
+#vacuum_freeze_table_age = -1 # target maximum XID age, or -1 to + # use autovacuum_freeze_max_age +#vacuum_multixact_freeze_table_age = -1 # target maximum MXID age, or -1 to + # use autovacuum_multixact_freeze_max_age + #------------------------------------------------------------------------------ # CLIENT CONNECTION DEFAULTS @@ -691,11 +698,9 @@ #lock_timeout = 0 # in milliseconds, 0 is disabled #idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled #idle_session_timeout = 0 # in milliseconds, 0 is disabled -#vacuum_freeze_table_age = 150000000 #vacuum_freeze_strategy_threshold = 4GB #vacuum_freeze_min_age = 50000000 #vacuum_failsafe_age = 1600000000 -#vacuum_multixact_freeze_table_age = 150000000 #vacuum_multixact_freeze_min_age = 5000000 #vacuum_multixact_failsafe_age = 1600000000 #bytea_output = 'hex' # hex, escape diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 094f9a35d..02186ce36 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9112,20 +9112,28 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; - VACUUM performs an aggressive scan if the table's - pg_class.relfrozenxid field has reached - the age specified by this setting. An aggressive scan differs from - a regular VACUUM in that it visits every page that might - contain unfrozen XIDs or MXIDs, not just those that might contain dead - tuples. The default is 150 million transactions. Although users can - set this value anywhere from zero to two billion, VACUUM - will silently limit the effective value to 95% of - , so that a - periodic manual VACUUM has a chance to run before an - anti-wraparound autovacuum is launched for the table. For more - information see - . + VACUUM reliably advances + relfrozenxid to a recent value if + the table's + pg_class.relfrozenxid + field has reached the age specified by this setting. + The default is -1. If -1 is specified, the value + of is used. 
+ Although users can set this value anywhere from zero to two + billion, VACUUM will silently limit the + effective value to . For more + information see . + + + The meaning of this parameter, and its default value, changed + in PostgreSQL 16. Freezing and advancing + pg_class.relfrozenxid + now take place more proactively, in every + VACUUM operation. + + diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index f61433c7d..9cae899d5 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -154,13 +154,8 @@ VACUUM [ FULL ] [ FREEZE ] [ VERBOSE ] [ ANALYZE ] [ visibility map. Pages where - all tuples are known to be frozen can always be skipped, and those - where all tuples are known to be visible to all transactions may be - skipped except when performing an aggressive vacuum. Furthermore, - except when performing an aggressive vacuum, some pages may be skipped - in order to avoid waiting for other sessions to finish using them. + Normally, VACUUM will skip pages based on the + visibility map. This option disables all page-skipping behavior, and is intended to be used only when the contents of the visibility map are suspect, which should happen only if there is a hardware or software diff --git a/src/test/regress/expected/reloptions.out b/src/test/regress/expected/reloptions.out index b6aef6f65..0e569d300 100644 --- a/src/test/regress/expected/reloptions.out +++ b/src/test/regress/expected/reloptions.out @@ -102,8 +102,8 @@ SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); ERROR: null value in column "i" of relation "reloptions_test" violates not-null constraint DETAIL: Failing row contains (null, null). --- Do an aggressive vacuum to prevent page-skipping. -VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test; +-- Do a VACUUM FREEZE to prevent skipping any pruning. 
+VACUUM FREEZE reloptions_test; SELECT pg_relation_size('reloptions_test') > 0; ?column? ---------- @@ -128,8 +128,8 @@ SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); ERROR: null value in column "i" of relation "reloptions_test" violates not-null constraint DETAIL: Failing row contains (null, null). --- Do an aggressive vacuum to prevent page-skipping. -VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test; +-- Do a VACUUM FREEZE to prevent skipping any pruning. +VACUUM FREEZE reloptions_test; SELECT pg_relation_size('reloptions_test') = 0; ?column? ---------- diff --git a/src/test/regress/sql/reloptions.sql b/src/test/regress/sql/reloptions.sql index 4252b0202..b2bed8ed8 100644 --- a/src/test/regress/sql/reloptions.sql +++ b/src/test/regress/sql/reloptions.sql @@ -61,8 +61,8 @@ CREATE TEMP TABLE reloptions_test(i INT NOT NULL, j text) autovacuum_enabled=false); SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); --- Do an aggressive vacuum to prevent page-skipping. -VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test; +-- Do a VACUUM FREEZE to prevent skipping any pruning. +VACUUM FREEZE reloptions_test; SELECT pg_relation_size('reloptions_test') > 0; SELECT reloptions FROM pg_class WHERE oid = @@ -72,8 +72,8 @@ SELECT reloptions FROM pg_class WHERE oid = ALTER TABLE reloptions_test RESET (vacuum_truncate); SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass; INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL); --- Do an aggressive vacuum to prevent page-skipping. -VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test; +-- Do a VACUUM FREEZE to prevent skipping any pruning. +VACUUM FREEZE reloptions_test; SELECT pg_relation_size('reloptions_test') = 0; -- Test toast.* options -- 2.38.1