From cb7775cae4af79ff43de22e9ed0a9af992fe58da Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 18 Jul 2022 14:35:44 -0700 Subject: [PATCH v16 2/3] Add eager and lazy VM strategies to VACUUM. Acquire an in-memory immutable "snapshot" of the target rel's visibility map at the start of each VACUUM. This local copy of the visibility map can spill to a temp file as and when required. Tables that are small enough to only need a single visibility map page don't need to use a temp file. VACUUM now uses its VM snapshot (not the authoritative VM) to determine which pages to scan. VACUUM no longer scans pages whose VM bits were concurrently unset, since all of the pages it will scan are known and fixed before scanning even begins. VACUUM decides on its VM snapshot scanning strategy up-front, based on information about costs taken from the snapshot, and on relfrozenxid age. Lazy scanning allows VACUUM to skip all-visible pages, whereas eager scanning allows VACUUM to advance relfrozenxid. This works in tandem with VACUUM's freezing strategies. This work often results in VACUUM advancing relfrozenxid at a cadence that is driven by underlying physical costs, not table age (through settings like autovacuum_freeze_max_age). Antiwraparound autovacuums will be far less common as a result. Freezing now drives relfrozenxid, rather than relfrozenxid driving freezing. Even tables that always use lazy freezing will have a decent chance of relfrozenxid advancement long before table age nears autovacuum_freeze_max_age. This also lays the groundwork for completely removing aggressive mode VACUUMs in a later commit. Scanning strategies now supersede the "early aggressive VACUUM" concept implemented by vacuum_freeze_table_age, which is now just a compatibility option (its new default of -1 is interpreted as "just use autovacuum_freeze_max_age"). For now, VACUUM still conditions its cleanup lock wait behavior on being in aggressive mode. Also add explicit I/O prefetching of heap pages, controlled by maintenance_io_concurrency. We prefetch at the point that the next block in line is requested by VACUUM. Prefetching is under the direct control of the visibility map snapshot code, since VACUUM's vmsnap is now an authoritative guide to which pages VACUUM will scan. Prefetching should entirely avoid the loss of performance that might otherwise result from removing SKIP_PAGES_THRESHOLD in this commit. SKIP_PAGES_THRESHOLD was intended to force OS readahead and to encourage relfrozenxid advancement. See commit bf136cf6, from around the time the visibility map first went in, for full details.
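To illustrate the scanning strategy cost model described above, here is a minimal standalone sketch (not code from the patch; break_even_threshold is a hypothetical helper name, with constants mirroring TABLEAGEFRAC_MIDPOINT/HIGHPOINT and MAX_PAGES_YOUNG/OLD_TABLEAGE from vacuumlazy.c) of how the acceptable number of extra scanned pages scales with tableagefrac:

    /*
     * Hypothetical sketch, not patch code: break-even point for choosing
     * VMSNAP_SCAN_EAGER over VMSNAP_SCAN_LAZY, in extra pages scanned
     */
    static BlockNumber
    break_even_threshold(BlockNumber rel_pages, double tableagefrac)
    {
        double      young = rel_pages * 0.05;   /* MAX_PAGES_YOUNG_TABLEAGE */
        double      old = rel_pages * 0.70;     /* MAX_PAGES_OLD_TABLEAGE */
        double      scale;

        if (tableagefrac < 0.5)                 /* TABLEAGEFRAC_MIDPOINT */
            return (BlockNumber) young;         /* eager only when very cheap */
        if (tableagefrac > 0.9)                 /* TABLEAGEFRAC_HIGHPOINT */
            return MaxBlockNumber;              /* eager effectively forced */

        /* Linear interpolation between the 5% and 70% cutoffs */
        scale = 1.0 - ((0.9 - tableagefrac) / (0.9 - 0.5));
        return (BlockNumber) (young * (1.0 - scale) + old * scale);
    }

For example, a 100,000 page table with tableagefrac = 0.7 tolerates roughly 37,500 extra scanned pages before VACUUM falls back to lazy scanning.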
Author: Peter Geoghegan Reviewed-By: Andres Freund Reviewed-By: Jeff Davis Reviewed-By: Matthias van de Meent Reviewed-By: John Naylor Discussion: https://postgr.es/m/CAH2-WzkFok_6EAHuK39GaW4FjEFQsY=3J0AAd6FXk93u-Xq3Fg@mail.gmail.com --- src/include/access/visibilitymap.h | 17 + src/include/commands/vacuum.h | 20 +- src/backend/access/heap/heapam.c | 1 + src/backend/access/heap/vacuumlazy.c | 618 +++++++++--------- src/backend/access/heap/visibilitymap.c | 539 +++++++++++++++ src/backend/commands/vacuum.c | 68 +- src/backend/utils/misc/guc_tables.c | 8 +- src/backend/utils/misc/postgresql.conf.sample | 9 +- doc/src/sgml/config.sgml | 66 +- doc/src/sgml/maintenance.sgml | 78 +-- doc/src/sgml/ref/vacuum.sgml | 10 +- src/test/regress/expected/reloptions.out | 8 +- src/test/regress/sql/reloptions.sql | 8 +- 13 files changed, 1037 insertions(+), 413 deletions(-) diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index daaa01a25..d8df744da 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -26,6 +26,17 @@ #define VM_ALL_FROZEN(r, b, v) \ ((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0) +/* Snapshot of visibility map at a point in time */ +typedef struct vmsnapshot vmsnapshot; + +/* VACUUM scanning strategy */ +typedef enum vmstrategy +{ + VMSNAP_SCAN_LAZY, /* Skip all-visible and all-frozen pages */ + VMSNAP_SCAN_EAGER, /* Only skip all-frozen pages */ + VMSNAP_SCAN_ALL /* Don't skip any pages (scan them instead) */ +} vmstrategy; + extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags); extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk, @@ -35,6 +46,12 @@ extern void visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags); extern uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf); +extern vmsnapshot *visibilitymap_snap_acquire(Relation rel, BlockNumber rel_pages, + BlockNumber *scanned_pages_lazy, + BlockNumber *scanned_pages_eager); +extern void visibilitymap_snap_strategy(vmsnapshot *vmsnap, vmstrategy strat); +extern BlockNumber visibilitymap_snap_next(vmsnapshot *vmsnap); +extern void visibilitymap_snap_release(vmsnapshot *vmsnap); extern void visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_frozen); extern BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index d900b1be1..16642b8b7 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -187,7 +187,7 @@ typedef struct VacAttrStats #define VACOPT_FULL 0x10 /* FULL (non-concurrent) vacuum */ #define VACOPT_SKIP_LOCKED 0x20 /* skip if cannot get lock */ #define VACOPT_PROCESS_TOAST 0x40 /* process the TOAST table, if any */ -#define VACOPT_DISABLE_PAGE_SKIPPING 0x80 /* don't skip any pages */ +#define VACOPT_DISABLE_PAGE_SKIPPING 0x80 /* don't skip using VM */ #define VACOPT_SKIP_DATABASE_STATS 0x100 /* skip vac_update_datfrozenxid() */ #define VACOPT_ONLY_DATABASE_STATS 0x200 /* only vac_update_datfrozenxid() */ @@ -282,6 +282,24 @@ struct VacuumCutoffs * Threshold that triggers VACUUM's eager freezing strategy */ BlockNumber freeze_strategy_threshold_nblocks; + + /* + * The tableagefrac value 1.0 represents the point that autovacuum.c + * scheduling (and VACUUM itself) considers relfrozenxid/relminmxid + * 
advancement strictly necessary. Values near 0.0 mean that both + relfrozenxid and relminmxid are recently allocated XID/MXID values. + * + * We don't need separate relfrozenxid and relminmxid tableagefrac + * variants. We base tableagefrac on whichever pg_class field is closer + * to the point of having autovacuum.c launch an autovacuum to advance the + * field's value. + * + * Lower values provide useful context, and influence whether VACUUM will + * opt to advance relfrozenxid before the point that it is strictly + * necessary. VACUUM can (and often does) opt to advance relfrozenxid + * and/or relminmxid proactively. + */ + double tableagefrac; }; /* diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 95f4d59e3..351b822b6 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -7057,6 +7057,7 @@ heap_freeze_tuple(HeapTupleHeader tuple, cutoffs.FreezeLimit = FreezeLimit; cutoffs.MultiXactCutoff = MultiXactCutoff; cutoffs.freeze_strategy_threshold_nblocks = 0; + cutoffs.tableagefrac = 0; pagefrz.freeze_required = true; pagefrz.FreezePageRelfrozenXid = FreezeLimit; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f9536e522..ecf4d7e05 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -11,8 +11,8 @@ * We are willing to use at most maintenance_work_mem (or perhaps * autovacuum_work_mem) memory space to keep track of dead TIDs. We initially * allocate an array of TIDs of that size, with an upper limit that depends on - * table size (this limit ensures we don't allocate a huge area uselessly for - * vacuuming small tables). If the array threatens to overflow, we must call + * the number of pages we'll scan (this limit ensures we don't allocate a huge + * area for TIDs uselessly). If the array threatens to overflow, we must call + * lazy_vacuum to vacuum indexes (and to vacuum the pages that we've pruned). * This frees up the memory space dedicated to storing dead TIDs. * @@ -110,10 +110,18 @@ ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) /* - * Before we consider skipping a page that's marked as clean in - * visibility map, we must've seen at least this many clean pages. + * tableagefrac-wise cutoffs influencing VACUUM's choice of scanning strategy */ -#define SKIP_PAGES_THRESHOLD ((BlockNumber) 32) +#define TABLEAGEFRAC_MIDPOINT 0.5 /* halfway to antiwraparound AV */ +#define TABLEAGEFRAC_HIGHPOINT 0.9 /* Eagerness now mandatory */ + +/* + * Thresholds (expressed as a proportion of rel_pages) that determine the + * cutoff (in extra pages scanned) for eager vmsnap scanning behavior at + * particular tableagefrac-wise table ages + */ +#define MAX_PAGES_YOUNG_TABLEAGE 0.05 /* 5% of rel_pages */ +#define MAX_PAGES_OLD_TABLEAGE 0.70 /* 70% of rel_pages */ /* * Size of the prefetch window for lazy vacuum backwards truncation scan. @@ -151,8 +159,6 @@ typedef struct LVRelState /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ bool aggressive; - /* Use visibility map to skip? (disabled by DISABLE_PAGE_SKIPPING) */ - bool skipwithvm; /* Eagerly freeze all tuples on pages about to be set all-visible? */ bool eager_freeze_strategy; /* Wraparound failsafe has been triggered?
*/ @@ -171,7 +177,9 @@ typedef struct LVRelState /* Tracks oldest extant XID/MXID for setting relfrozenxid/relminmxid */ TransactionId NewRelfrozenXid; MultiXactId NewRelminMxid; - bool skippedallvis; + /* Immutable snapshot of visibility map (as of time that VACUUM began) */ + vmsnapshot *vmsnap; + vmstrategy vmstrat; /* Error reporting state */ char *dbname; @@ -223,6 +231,7 @@ typedef struct LVPagePruneState { bool hastup; /* Page prevents rel truncation? */ bool has_lpdead_items; /* includes existing LP_DEAD items */ + bool pd_allvis_corrupt; /* PD_ALL_VISIBLE bit spuriously set? */ /* * State describes the proper VM bit states to set for the page following @@ -245,11 +254,8 @@ typedef struct LVSavedErrInfo /* non-export function prototypes */ static void lazy_scan_heap(LVRelState *vacrel); -static void lazy_scan_strategy(LVRelState *vacrel); -static BlockNumber lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, - BlockNumber next_block, - bool *next_unskippable_allvis, - bool *skipping_current_range); +static BlockNumber lazy_scan_strategy(LVRelState *vacrel, + bool force_scan_all); static bool lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, Page page, bool sharelock, Buffer vmbuffer); @@ -279,7 +285,8 @@ static bool should_attempt_truncation(LVRelState *vacrel); static void lazy_truncate_heap(LVRelState *vacrel); static BlockNumber count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected); -static void dead_items_alloc(LVRelState *vacrel, int nworkers); +static void dead_items_alloc(LVRelState *vacrel, int nworkers, + BlockNumber scanned_pages); static void dead_items_cleanup(LVRelState *vacrel); static bool heap_page_is_all_visible(LVRelState *vacrel, Buffer buf, TransactionId *visibility_cutoff_xid, bool *all_frozen); @@ -311,10 +318,10 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, LVRelState *vacrel; bool verbose, instrument, - skipwithvm, frozenxid_updated, minmulti_updated; BlockNumber orig_rel_pages, + scanned_pages, new_rel_pages, new_rel_allvisible; PGRUsage ru0; @@ -461,37 +468,29 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Initialize state used to track oldest extant XID/MXID */ vacrel->NewRelfrozenXid = vacrel->cutoffs.OldestXmin; vacrel->NewRelminMxid = vacrel->cutoffs.OldestMxact; - vacrel->skippedallvis = false; - skipwithvm = true; - if (params->options & VACOPT_DISABLE_PAGE_SKIPPING) - { - /* - * Force aggressive mode, and disable skipping blocks using the - * visibility map (even those set all-frozen) - */ - vacrel->aggressive = true; - skipwithvm = false; - } - - vacrel->skipwithvm = skipwithvm; /* - * Now determine VACUUM's freezing strategy. + * Now determine VACUUM's freezing and scanning strategies. + * + * This process is driven in part by information from VACUUM's visibility + * map snapshot, which will be acquired in passing. lazy_scan_heap will + * use the same immutable VM snapshot to determine which pages to scan. + * Using an immutable structure (instead of the live visibility map) makes + * VACUUM avoid scanning concurrently modified pages. These pages can + * only have deleted tuples that OldestXmin will consider RECENTLY_DEAD. 
*/ - lazy_scan_strategy(vacrel); + scanned_pages = lazy_scan_strategy(vacrel, + (params->options & + VACOPT_DISABLE_PAGE_SKIPPING) != 0); if (verbose) - { - if (vacrel->aggressive) - ereport(INFO, - (errmsg("aggressively vacuuming \"%s.%s.%s\"", - vacrel->dbname, vacrel->relnamespace, - vacrel->relname))); - else - ereport(INFO, - (errmsg("vacuuming \"%s.%s.%s\"", - vacrel->dbname, vacrel->relnamespace, - vacrel->relname))); - } + ereport(INFO, + (errmsg("vacuuming \"%s.%s.%s\"", + vacrel->dbname, vacrel->relnamespace, + vacrel->relname), + errdetail("Table has %u pages in total, of which %u pages (%.2f%% of total) will be scanned.", + orig_rel_pages, scanned_pages, + orig_rel_pages == 0 ? 100.0 : + 100.0 * scanned_pages / orig_rel_pages))); /* * Allocate dead_items array memory using dead_items_alloc. This handles @@ -501,13 +500,14 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, * is already dangerously old.) */ lazy_check_wraparound_failsafe(vacrel); - dead_items_alloc(vacrel, params->nworkers); + dead_items_alloc(vacrel, params->nworkers, scanned_pages); /* * Call lazy_scan_heap to perform all required heap pruning, index * vacuuming, and heap vacuuming (plus related processing) */ lazy_scan_heap(vacrel); + Assert(vacrel->scanned_pages == scanned_pages); /* * Free resources managed by dead_items_alloc. This ends parallel mode in @@ -554,12 +554,11 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, MultiXactIdPrecedesOrEquals(vacrel->aggressive ? vacrel->cutoffs.MultiXactCutoff : vacrel->cutoffs.relminmxid, vacrel->NewRelminMxid)); - if (vacrel->skippedallvis) + if (vacrel->vmstrat == VMSNAP_SCAN_LAZY) { /* - * Must keep original relfrozenxid in a non-aggressive VACUUM that - * chose to skip an all-visible page range. The state that tracks new - * values will have missed unfrozen XIDs from the pages we skipped. 
+ * Must keep original relfrozenxid/relminmxid when lazy_scan_strategy + * decided to skip all-visible pages containing unfrozen XIDs/MXIDs */ Assert(!vacrel->aggressive); vacrel->NewRelfrozenXid = InvalidTransactionId; @@ -604,6 +603,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, vacrel->missed_dead_tuples); pgstat_progress_end_command(); + /* Done with rel's visibility map snapshot */ + visibilitymap_snap_release(vacrel->vmsnap); + if (instrument) { TimestampTz endtime = GetCurrentTimestamp(); @@ -631,10 +633,6 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, initStringInfo(&buf); if (verbose) { - /* - * Aggressiveness already reported earlier, in dedicated - * VACUUM VERBOSE ereport - */ Assert(!params->is_wraparound); msgfmt = _("finished vacuuming \"%s.%s.%s\": index scans: %d\n"); } @@ -829,13 +827,10 @@ static void lazy_scan_heap(LVRelState *vacrel) { BlockNumber rel_pages = vacrel->rel_pages, - blkno, - next_unskippable_block, + next_block_to_scan, next_fsm_block_to_vacuum = 0; VacDeadItems *dead_items = vacrel->dead_items; Buffer vmbuffer = InvalidBuffer; - bool next_unskippable_allvis, - skipping_current_range; const int initprog_index[] = { PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_TOTAL_HEAP_BLKS, @@ -849,46 +844,27 @@ lazy_scan_heap(LVRelState *vacrel) initprog_val[2] = dead_items->max_items; pgstat_progress_update_multi_param(3, initprog_index, initprog_val); - /* Set up an initial range of skippable blocks using the visibility map */ - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, - &next_unskippable_allvis, - &skipping_current_range); - for (blkno = 0; blkno < rel_pages; blkno++) + next_block_to_scan = visibilitymap_snap_next(vacrel->vmsnap); + while (next_block_to_scan < rel_pages) { + BlockNumber blkno = next_block_to_scan; Buffer buf; Page page; - bool all_visible_according_to_vm; LVPagePruneState prunestate; - if (blkno == next_unskippable_block) - { - /* - * Can't skip this page safely. Must scan the page. But - * determine the next skippable range after the page first. 
- */ - all_visible_according_to_vm = next_unskippable_allvis; - next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, - blkno + 1, - &next_unskippable_allvis, - &skipping_current_range); + next_block_to_scan = visibilitymap_snap_next(vacrel->vmsnap); - Assert(next_unskippable_block >= blkno + 1); - } - else - { - /* Last page always scanned (may need to set nonempty_pages) */ - Assert(blkno < rel_pages - 1); - - if (skipping_current_range) - continue; - - /* Current range is too small to skip -- just scan the page */ - all_visible_according_to_vm = true; - } + /* + * visibilitymap_snap_next must always force us to scan the last page + * in rel (in the range of rel_pages) so that VACUUM can avoid useless + * attempts at rel truncation (per should_attempt_truncation comments) + */ + Assert(next_block_to_scan > blkno); + Assert(next_block_to_scan < rel_pages || blkno == rel_pages - 1); vacrel->scanned_pages++; - /* Report as block scanned, update error traceback information */ + /* Report all blocks < blkno as initial-heap-pass processed */ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); update_vacuum_error_info(vacrel, NULL, VACUUM_ERRCB_PHASE_SCAN_HEAP, blkno, InvalidOffsetNumber); @@ -1025,12 +1001,24 @@ lazy_scan_heap(LVRelState *vacrel) */ lazy_scan_prune(vacrel, buf, blkno, page, &prunestate); - Assert(!prunestate.all_visible || !prunestate.has_lpdead_items); - /* Remember the location of the last page with nonremovable tuples */ if (prunestate.hastup) vacrel->nonempty_pages = blkno + 1; + /* + * Clear PD_ALL_VISIBLE (and page's visibility map bits) in the event + * of lazy_scan_prune detecting an inconsistency + */ + if (unlikely(prunestate.pd_allvis_corrupt)) + { + elog(WARNING, "page containing dead tuples has PD_ALL_VISIBLE set in relation \"%s\" page %u", + vacrel->relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(vacrel->rel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } + if (vacrel->nindexes == 0) { /* @@ -1089,10 +1077,9 @@ lazy_scan_heap(LVRelState *vacrel) } /* - * Handle setting visibility map bit based on information from the VM - * (as of last lazy_scan_skip() call), and from prunestate + * Set visibility map bits based on prunestate's instructions */ - if (!all_visible_according_to_vm && prunestate.all_visible) + if (prunestate.all_visible) { uint8 flags = VISIBILITYMAP_ALL_VISIBLE; @@ -1102,34 +1089,36 @@ lazy_scan_heap(LVRelState *vacrel) flags |= VISIBILITYMAP_ALL_FROZEN; } - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set both bits so - * that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); + if (!PageIsAllVisible(page)) + { + /* + * We could avoid dirtying the page just to set PD_ALL_VISIBLE + * when checksums are disabled. It is very likely that the + * heap page is already dirty anyway, so keep the rule simple: + * always dirty a page when setting its PD_ALL_VISIBLE bit. 
+ */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + } visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, vmbuffer, prunestate.visibility_cutoff_xid, flags); } /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after lazy_scan_skip() was called, so we must recheck - * with buffer lock before concluding that the VM is corrupt. + * When the page isn't eligible to become all-visible, we defensively + * check that PD_ALL_VISIBLE agrees with the visibility map instead. + * If there is disagreement then we clear both VM bits to repair. + * + * We don't expect (and deliberately avoid testing) mutual agreement; + * it's okay for PD_ALL_VISIBLE to be set while both visibility map + * bits remain unset (iff checksums are disabled). It's even okay for + * prunestate's all_visible flag to disagree with PD_ALL_VISIBLE here + * (lazy_scan_prune's pd_allvis_corrupt comments explain why that is). */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) && - visibilitymap_get_status(vacrel->rel, blkno, &vmbuffer) != 0) + else if (!PageIsAllVisible(page) && + unlikely(visibilitymap_get_status(vacrel->rel, blkno, + &vmbuffer) != 0)) { elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", vacrel->relname, blkno); @@ -1137,65 +1126,6 @@ lazy_scan_heap(LVRelState *vacrel) VISIBILITYMAP_VALID_BITS); } - /* - * It's possible for the value returned by - * GetOldestNonRemovableTransactionId() to move backwards, so it's not - * wrong for us to see tuples that appear to not be visible to - * everyone yet, while PD_ALL_VISIBLE is already set. The real safe - * xmin value never moves backwards, but - * GetOldestNonRemovableTransactionId() is conservative and sometimes - * returns a value that's unnecessarily small, so if we see that - * contradiction it just means that the tuples that we think are not - * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag - * is correct. - * - * There should never be LP_DEAD items on a page with PD_ALL_VISIBLE - * set, however. - */ - else if (prunestate.has_lpdead_items && PageIsAllVisible(page)) - { - elog(WARNING, "page containing LP_DEAD items is marked as all-visible in relation \"%s\" page %u", - vacrel->relname, blkno); - PageClearAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_clear(vacrel->rel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } - - /* - * If the all-visible page is all-frozen but not marked as such yet, - * mark it as all-frozen. Note that all_frozen is only valid if - * all_visible is true, so we must check both prunestate fields. - */ - else if (all_visible_according_to_vm && prunestate.all_visible && - prunestate.all_frozen && - !VM_ALL_FROZEN(vacrel->rel, blkno, &vmbuffer)) - { - /* - * Avoid relying on all_visible_according_to_vm as a proxy for the - * page-level PD_ALL_VISIBLE bit being set, since it might have - * become stale -- even when all_visible is set in prunestate - */ - if (!PageIsAllVisible(page)) - { - PageSetAllVisible(page); - MarkBufferDirty(buf); - } - - /* - * Set the page all-frozen (and all-visible) in the VM. - * - * We can pass InvalidTransactionId as our visibility_cutoff_xid, - * since a snapshotConflictHorizon sufficient to make everything - * safe for REDO was logged when the page's tuples were frozen. 
- */ - Assert(!TransactionIdIsValid(prunestate.visibility_cutoff_xid)); - visibilitymap_set(vacrel->rel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | - VISIBILITYMAP_ALL_FROZEN); - } - /* * Final steps for block: drop cleanup lock, record free space in the * FSM @@ -1232,12 +1162,13 @@ lazy_scan_heap(LVRelState *vacrel) } } + /* initial heap pass finished (final pass may still be required) */ vacrel->blkno = InvalidBlockNumber; if (BufferIsValid(vmbuffer)) ReleaseBuffer(vmbuffer); - /* report that everything is now scanned */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); + /* report all blocks as initial-heap-pass processed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, rel_pages); /* now we can compute the new value for pg_class.reltuples */ vacrel->new_live_tuples = vac_estimate_reltuples(vacrel->rel, rel_pages, @@ -1254,20 +1185,26 @@ lazy_scan_heap(LVRelState *vacrel) /* * Do index vacuuming (call each index's ambulkdelete routine), then do - * related heap vacuuming + * related heap vacuuming in final heap pass */ if (dead_items->num_items > 0) lazy_vacuum(vacrel); /* - * Vacuum the remainder of the Free Space Map. We must do this whether or - * not there were indexes, and whether or not we bypassed index vacuuming. + * Now that both our initial heap pass and final heap pass (if any) have + * ended, vacuum the Free Space Map. (Actually, similar FSM vacuuming will + * have taken place earlier when VACUUM needed to call lazy_vacuum to deal + * with running out of dead_items space. Hopefully that will be rare.) */ - if (blkno > next_fsm_block_to_vacuum) - FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, blkno); + if (rel_pages > 0) + { + Assert(vacrel->scanned_pages > 0); + FreeSpaceMapVacuumRange(vacrel->rel, next_fsm_block_to_vacuum, + rel_pages); + } - /* report all blocks vacuumed */ - pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, blkno); + /* report all blocks as final-heap-pass processed */ + pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_VACUUMED, rel_pages); /* Do final index cleanup (call each index's amvacuumcleanup routine) */ if (vacrel->nindexes > 0 && vacrel->do_index_cleanup) @@ -1275,7 +1212,7 @@ lazy_scan_heap(LVRelState *vacrel) } /* - * lazy_scan_strategy() -- Determine freezing strategy. + * lazy_scan_strategy() -- Determine freezing/vmsnap scanning strategies. * * Our lazy freezing strategy is useful when putting off the work of freezing * totally avoids freezing that turns out to have been wasted effort later on. @@ -1283,11 +1220,42 @@ lazy_scan_heap(LVRelState *vacrel) * continual growth, where freezing pages proactively is needed just to avoid * falling behind on freezing (eagerness is also likely to be cheaper in the * short/medium term for such tables, but the long term picture matters most). + * + * Our lazy vmsnap scanning strategy is useful when we can save a significant + * amount of work in the short term by not advancing relfrozenxid/relminmxid. + * Our eager vmsnap scanning strategy is useful when there is hardly any work + * avoided by being lazy anyway, and/or when tableagefrac is nearing or has + * already surpassed 1.0, which is the point of antiwraparound autovacuuming. + * + * Freezing and scanning strategies are structured as two independent choices, + * but they are not independent in any practical sense (it's just mechanical). 
+ * Eager and lazy behaviors go hand in hand, since the choice of each strategy + * is driven by similar considerations about the needs of the target table. + * Moreover, choosing eager scanning strategy can easily result in freezing + * many more pages (compared to an equivalent lazy scanning strategy VACUUM), + * since VACUUM can only freeze pages that it actually scans. (All-visible + * pages may well have XIDs < FreezeLimit by now, but VACUUM has no way of + * noticing that it should freeze such pages besides just scanning them.) + * + * The single most important justification for the eager behaviors is system + * level performance stability. It is often better to freeze all-visible + * pages before we're truly forced to (just to advance relfrozenxid) as a way + * of avoiding big spikes, where VACUUM has to freeze many pages all at once. + * + * Returns final scanned_pages for the VACUUM operation. The exact number of + * pages that lazy_scan_heap scans depends in part on which vmsnap scanning + * strategy we choose (only eager scanning will scan rel's all-visible pages). */ -static void -lazy_scan_strategy(LVRelState *vacrel) +static BlockNumber +lazy_scan_strategy(LVRelState *vacrel, bool force_scan_all) { - BlockNumber rel_pages = vacrel->rel_pages; + BlockNumber rel_pages = vacrel->rel_pages, + scanned_pages_lazy, + scanned_pages_eager, + nextra_scanned_eager, + nextra_young_threshold, + nextra_old_threshold, + nextra_toomany_threshold; /* * Decide freezing strategy. @@ -1295,125 +1263,160 @@ lazy_scan_strategy(LVRelState *vacrel) * The eager freezing strategy is used when the threshold controlled by * freeze_strategy_threshold GUC/reloption exceeds rel_pages. * + * Also freeze eagerly whenever table age is close to requiring (or is + * actually undergoing) an antiwraparound autovacuum. This may delay the + * next antiwraparound autovacuum against the table. We avoid relying on + * them, if at all possible (mostly-static tables tend to rely on them). + * * Also freeze eagerly with an unlogged or temp table, where the total * cost of freezing each page is just the cycles needed to prepare a set * of freeze plans. Executing the freeze plans adds very little cost. * Dirtying extra pages isn't a concern, either; VACUUM will definitely * set PD_ALL_VISIBLE on affected pages, regardless of freezing strategy. + * + * Once a table first becomes big enough for eager freezing, it's almost + * inevitable that it will also naturally settle into a cadence where + * relfrozenxid is advanced during every VACUUM (barring rel truncation). + * This is a consequence of eager freezing strategy avoiding creating new + * all-visible pages: if there never are any all-visible pages (if all + * skippable pages are fully all-frozen), then there is no way that lazy + * scanning strategy can ever look better than eager scanning strategy. + * There are still ways that the occasional all-visible page could slip + * into a table that we always freeze eagerly (at least when its tuples + * tend to contain MultiXacts), but that should have negligible impact. */ vacrel->eager_freeze_strategy = (rel_pages >= vacrel->cutoffs.freeze_strategy_threshold_nblocks || + vacrel->cutoffs.tableagefrac > TABLEAGEFRAC_HIGHPOINT || !RelationIsPermanent(vacrel->rel)); -} - -/* - * lazy_scan_skip() -- set up range of skippable blocks using visibility map. - * - * lazy_scan_heap() calls here every time it needs to set up a new range of - * blocks to skip via the visibility map. Caller passes the next block in - * line. 
We return a next_unskippable_block for this range. When there are - * no skippable blocks we just return caller's next_block. The all-visible - * status of the returned block is set in *next_unskippable_allvis for caller, - * too. Block usually won't be all-visible (since it's unskippable), but it - * can be during aggressive VACUUMs (as well as in certain edge cases). - * - * Sets *skipping_current_range to indicate if caller should skip this range. - * Costs and benefits drive our decision. Very small ranges won't be skipped. - * - * Note: our opinion of which blocks can be skipped can go stale immediately. - * It's okay if caller "misses" a page whose all-visible or all-frozen marking - * was concurrently cleared, though. All that matters is that caller scan all - * pages whose tuples might contain XIDs < OldestXmin, or MXIDs < OldestMxact. - * (Actually, non-aggressive VACUUMs can choose to skip all-visible pages with - * older XIDs/MXIDs. The vacrel->skippedallvis flag will be set here when the - * choice to skip such a range is actually made, making everything safe.) - */ -static BlockNumber -lazy_scan_skip(LVRelState *vacrel, Buffer *vmbuffer, BlockNumber next_block, - bool *next_unskippable_allvis, bool *skipping_current_range) -{ - BlockNumber rel_pages = vacrel->rel_pages, - next_unskippable_block = next_block, - nskippable_blocks = 0; - bool skipsallvis = false; - - *next_unskippable_allvis = true; - while (next_unskippable_block < rel_pages) - { - uint8 mapbits = visibilitymap_get_status(vacrel->rel, - next_unskippable_block, - vmbuffer); - - if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0) - { - Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0); - *next_unskippable_allvis = false; - break; - } - - /* - * Caller must scan the last page to determine whether it has tuples - * (caller must have the opportunity to set vacrel->nonempty_pages). - * This rule avoids having lazy_truncate_heap() take access-exclusive - * lock on rel to attempt a truncation that fails anyway, just because - * there are tuples on the last page (it is likely that there will be - * tuples on other nearby pages as well, but those can be skipped). - * - * Implement this by always treating the last block as unsafe to skip. - */ - if (next_unskippable_block == rel_pages - 1) - break; - - /* DISABLE_PAGE_SKIPPING makes all skipping unsafe */ - if (!vacrel->skipwithvm) - { - /* Caller shouldn't rely on all_visible_according_to_vm */ - *next_unskippable_allvis = false; - break; - } - - /* - * Aggressive VACUUM caller can't skip pages just because they are - * all-visible. They may still skip all-frozen pages, which can't - * contain XIDs < OldestXmin (XIDs that aren't already frozen by now). - */ - if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0) - { - if (vacrel->aggressive) - break; - - /* - * All-visible block is safe to skip in non-aggressive case. But - * remember that the final range contains such a block for later. - */ - skipsallvis = true; - } - - vacuum_delay_point(); - next_unskippable_block++; - nskippable_blocks++; - } /* - * We only skip a range with at least SKIP_PAGES_THRESHOLD consecutive - * pages. Since we're reading sequentially, the OS should be doing - * readahead for us, so there's no gain in skipping a page now and then. - * Skipping such a range might even discourage sequential detection. + * Decide vmsnap scanning strategy. * - * This test also enables more frequent relfrozenxid advancement during - * non-aggressive VACUUMs. 
If the range has any all-visible pages then - skipping makes updating relfrozenxid unsafe, which is a real downside. + * First acquire a visibility map snapshot, which determines the number of + * pages that each vmsnap scanning strategy is required to scan for us in + * passing. + * + * The number of "extra" scanned_pages added by choosing VMSNAP_SCAN_EAGER + * over VMSNAP_SCAN_LAZY is a key input into the decision making process. + * It is a good proxy for the added cost of applying our eager vmsnap + * strategy during this particular VACUUM. (We may or may not have to + * dirty/freeze the extra pages when we scan them, which isn't something + * that we try to model. It shouldn't matter very much at this level.) */ - if (nskippable_blocks < SKIP_PAGES_THRESHOLD) - *skipping_current_range = false; + vacrel->vmsnap = visibilitymap_snap_acquire(vacrel->rel, rel_pages, + &scanned_pages_lazy, + &scanned_pages_eager); + nextra_scanned_eager = scanned_pages_eager - scanned_pages_lazy; + + /* + * Next determine guideline "nextra_scanned_eager" thresholds, which are + * applied based in part on tableagefrac (when nextra_toomany_threshold is + * determined below). These thresholds also represent the minimum and + * maximum cutoffs that can ever make sense for a table of this size (when + * the table's age isn't old enough to make eagerness mandatory). + * + * For the most part we only care about relative (not absolute) costs. We + * want to advance relfrozenxid at an opportune time, during a VACUUM that + * has to scan relatively many pages either way (whether due to the need + * to remove dead tuples from many pages, or due to the table containing + * lots of existing all-frozen pages, or due to a combination of both). + * Even small tables (where lazy freezing is used) shouldn't have to do + * dramatically more work than usual when advancing relfrozenxid, which + * our policy of waiting for the right VACUUM largely avoids, in practice. + */ + nextra_young_threshold = (double) rel_pages * MAX_PAGES_YOUNG_TABLEAGE; + nextra_old_threshold = (double) rel_pages * MAX_PAGES_OLD_TABLEAGE; + + /* + * Next determine nextra_toomany_threshold, which represents how many + * extra scanned_pages are deemed too high a cost to pay for eagerness, + * given present conditions. This is our model's break-even point. + */ + if (vacrel->cutoffs.tableagefrac < TABLEAGEFRAC_MIDPOINT) + { + /* + * The table's age is still below the table age midpoint, so table age + * is still of only minimal concern. We're still willing to act + * eagerly when it's _very_ cheap to do so: when use of + * VMSNAP_SCAN_EAGER will force us to scan some extra pages not + * exceeding 5% of rel_pages. + */ + nextra_toomany_threshold = nextra_young_threshold; + } + else if (vacrel->cutoffs.tableagefrac <= TABLEAGEFRAC_HIGHPOINT) + { + double nextra_scale; + + /* + * The table's age is starting to become a concern, but not to the + * extent that we'll force the use of VMSNAP_SCAN_EAGER strategy. + * We'll need to interpolate to get an nextra_scanned_eager-based + * threshold. + * + * If tableagefrac is only barely over the midway point, then we'll + * choose an nextra_scanned_eager threshold of ~5% of rel_pages. The + * opposite extreme occurs when tableagefrac is very near to the high + * point. That will make our nextra_scanned_eager threshold very + * aggressive: we'll go with VMSNAP_SCAN_EAGER when doing so requires + * we scan a number of extra blocks as high as ~70% of rel_pages.
+ * + * Note that the threshold grows (on a percentage basis) by ~8.1% of + * rel_pages for every additional 5%-of-tableagefrac increment added + * (after tableagefrac has crossed the 50%-of-tableagefrac mid point, + * until the 90%-of-tableagefrac high point is reached, when we switch + * over to not caring about the added cost of eager freezing at all). + */ + nextra_scale = + 1.0 - ((TABLEAGEFRAC_HIGHPOINT - vacrel->cutoffs.tableagefrac) / + (TABLEAGEFRAC_HIGHPOINT - TABLEAGEFRAC_MIDPOINT)); + + nextra_toomany_threshold = + (nextra_young_threshold * (1.0 - nextra_scale)) + + (nextra_old_threshold * nextra_scale); + } else { - *skipping_current_range = true; - if (skipsallvis) - vacrel->skippedallvis = true; + /* + * The table's age surpasses the high point, and so is approaching (or + * may even surpass) the point that an antiwraparound autovacuum is + * required. Force VMSNAP_SCAN_EAGER, no matter how many extra pages + * we'll be required to scan as a result (costs no longer matter). + * + * Note that there is a discontinuity when tableagefrac crosses this + * 90%-of-tableagefrac high point: the threshold set here jumps from + * 70% of rel_pages to 100% of rel_pages (MaxBlockNumber, actually). + * It's useful to only care about table age once it gets this high. + * That way even extreme cases will have at least some chance of using + * eager scanning before an antiwraparound autovacuum is launched. + */ + nextra_toomany_threshold = MaxBlockNumber; } - return next_unskippable_block; + /* Make final choice on scanning strategy using final threshold */ + nextra_toomany_threshold = Max(nextra_toomany_threshold, 32); + vacrel->vmstrat = (nextra_scanned_eager >= nextra_toomany_threshold ? + VMSNAP_SCAN_LAZY : VMSNAP_SCAN_EAGER); + + /* + * VACUUM's DISABLE_PAGE_SKIPPING option overrides our decision by forcing + * VACUUM to scan every page (VACUUM effectively distrusts rel's VM) + */ + if (force_scan_all) + vacrel->vmstrat = VMSNAP_SCAN_ALL; + + Assert(!vacrel->aggressive || vacrel->vmstrat != VMSNAP_SCAN_LAZY); + + /* Inform vmsnap infrastructure of our chosen strategy */ + visibilitymap_snap_strategy(vacrel->vmsnap, vacrel->vmstrat); + + /* Return appropriate scanned_pages for final strategy chosen */ + if (vacrel->vmstrat == VMSNAP_SCAN_LAZY) + return scanned_pages_lazy; + if (vacrel->vmstrat == VMSNAP_SCAN_EAGER) + return scanned_pages_eager; + + /* DISABLE_PAGE_SKIPPING/VMSNAP_SCAN_ALL case */ + return rel_pages; } /* @@ -1633,6 +1636,7 @@ retry: */ prunestate->hastup = false; prunestate->has_lpdead_items = false; + prunestate->pd_allvis_corrupt = false; prunestate->all_visible = true; prunestate->all_frozen = true; prunestate->visibility_cutoff_xid = InvalidTransactionId; @@ -1966,12 +1970,26 @@ retry: prunestate->all_visible = false; } - /* Finally, add page-local counts to whole-VACUUM counts */ + /* Add page-local counts to whole-VACUUM counts */ vacrel->tuples_deleted += tuples_deleted; vacrel->tuples_frozen += tuples_frozen; vacrel->lpdead_items += lpdead_items; vacrel->live_tuples += live_tuples; vacrel->recently_dead_tuples += recently_dead_tuples; + + /* + * There should never be dead or deleted tuples when PD_ALL_VISIBLE is + * already set. Check that now, to help caller maintain the VM correctly. + * + * We deliberately avoid indicating corruption when a tuple was found to + * be HEAPTUPLE_INSERT_IN_PROGRESS on a page that has PD_ALL_VISIBLE set. + * That would lead to false positives, since OldestXmin is conservative. 
+ * (It's possible that this VACUUM has an earlier OldestXmin than a VACUUM + * that ran against the same table at some point in the recent past.) + */ + if (PageIsAllVisible(page) && + (lpdead_items > 0 || tuples_deleted > 0 || recently_dead_tuples > 0)) + prunestate->pd_allvis_corrupt = true; } /* @@ -2503,6 +2521,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuumed_pages++; } + /* final heap pass finished */ vacrel->blkno = InvalidBlockNumber; if (BufferIsValid(vmbuffer)) ReleaseBuffer(vmbuffer); @@ -2846,6 +2865,14 @@ lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, * Also don't attempt it if we are doing early pruning/vacuuming, because a * scan which cannot find a truncated heap page cannot determine that the * snapshot is too old to read that page. + * + * Note that we effectively rely on visibilitymap_snap_next() having forced + * VACUUM to scan the final page (rel_pages - 1) in all cases. Without that, + * we'd tend to needlessly acquire an AccessExclusiveLock just to attempt rel + * truncation that is bound to fail. VACUUM cannot set vacrel->nonempty_pages + * in pages that it skips using the VM, so we must avoid interpreting skipped + * pages as empty pages when it makes little sense. Observing that the final + * page has tuples is a simple way of avoiding pathological locking behavior. */ static bool should_attempt_truncation(LVRelState *vacrel) @@ -3136,14 +3163,13 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected) /* * Returns the number of dead TIDs that VACUUM should allocate space to - * store, given a heap rel of size vacrel->rel_pages, and given current - * maintenance_work_mem setting (or current autovacuum_work_mem setting, - * when applicable). + * store, given the expected scanned_pages for this VACUUM operation, + * and given current maintenance_work_mem/autovacuum_work_mem setting. * * See the comments at the head of this file for rationale. */ static int -dead_items_max_items(LVRelState *vacrel) +dead_items_max_items(LVRelState *vacrel, BlockNumber scanned_pages) { int64 max_items; int vac_work_mem = IsAutoVacuumWorkerProcess() && @@ -3152,15 +3178,13 @@ dead_items_max_items(LVRelState *vacrel) if (vacrel->nindexes > 0) { - BlockNumber rel_pages = vacrel->rel_pages; - max_items = MAXDEADITEMS(vac_work_mem * 1024L); max_items = Min(max_items, INT_MAX); max_items = Min(max_items, MAXDEADITEMS(MaxAllocSize)); /* curious coding here to ensure the multiplication can't overflow */ - if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > rel_pages) - max_items = rel_pages * MaxHeapTuplesPerPage; + if ((BlockNumber) (max_items / MaxHeapTuplesPerPage) > scanned_pages) + max_items = scanned_pages * MaxHeapTuplesPerPage; /* stay sane if small maintenance_work_mem */ max_items = Max(max_items, MaxHeapTuplesPerPage); @@ -3182,12 +3206,12 @@ dead_items_max_items(LVRelState *vacrel) * DSM when required. 
*/ static void -dead_items_alloc(LVRelState *vacrel, int nworkers) +dead_items_alloc(LVRelState *vacrel, int nworkers, BlockNumber scanned_pages) { VacDeadItems *dead_items; int max_items; - max_items = dead_items_max_items(vacrel); + max_items = dead_items_max_items(vacrel, scanned_pages); Assert(max_items >= MaxHeapTuplesPerPage); /* diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 74ff01bb1..379c1ba5b 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -16,6 +16,10 @@ * visibilitymap_pin_ok - check whether correct map page is already pinned * visibilitymap_set - set a bit in a previously pinned page * visibilitymap_get_status - get status of bits + * visibilitymap_snap_acquire - acquire snapshot of visibility map + * visibilitymap_snap_strategy - set VACUUM's scanning strategy + * visibilitymap_snap_next - get next block to scan from vmsnap + * visibilitymap_snap_release - release previously acquired snapshot * visibilitymap_count - count number of bits set in visibility map * visibilitymap_prepare_truncate - * prepare for truncation of the visibility map @@ -52,6 +56,10 @@ * * VACUUM will normally skip pages for which the visibility map bit is set; * such pages can't contain any dead tuples and therefore don't need vacuuming. + * VACUUM uses a snapshot of the visibility map to avoid scanning pages whose + * visibility map bit gets concurrently unset. This also provides us with a + * convenient way of performing I/O prefetching on behalf of VACUUM, since the + * pages that VACUUM's first heap pass will scan are fully predetermined. * * LOCKING * @@ -92,10 +100,12 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "storage/buffile.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/smgr.h" #include "utils/inval.h" +#include "utils/spccache.h" /*#define TRACE_VISIBILITYMAP */ @@ -124,9 +134,81 @@ #define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each * bit pair */ +/* + * Prefetching of heap pages takes place as VACUUM requests the next block in + * line from its visibility map snapshot + * + * XXX MIN_PREFETCH_SIZE of 32 is a little on the high side, but matches + * hard-coded constant used by vacuumlazy.c when prefetching for rel + * truncation. Might be better to increase the maintenance_io_concurrency + * default, or to do nothing like this at all. + */ +#define STAGED_BUFSIZE (MAX_IO_CONCURRENCY * 2) +#define MIN_PREFETCH_SIZE ((BlockNumber) 32) + +/* + * Snapshot of visibility map at the start of a VACUUM operation + */ +struct vmsnapshot +{ + /* Target heap rel */ + Relation rel; + /* Scanning strategy used by VACUUM operation */ + vmstrategy strat; + /* Per-strategy final scanned_pages */ + BlockNumber rel_pages; + BlockNumber scanned_pages_lazy; + BlockNumber scanned_pages_eager; + + /* + * Materialized visibility map state. + * + * VM snapshots spill to a temp file when required. 
+ */ + BlockNumber nvmpages; + BufFile *file; + + /* + * Prefetch distance, used to perform I/O prefetching of heap pages + */ + int prefetch_distance; + + /* Current VM page cached */ + BlockNumber curvmpage; + char *rawmap; + PGAlignedBlock vmpage; + + /* Staging area for blocks returned to VACUUM */ + BlockNumber staged[STAGED_BUFSIZE]; + int current_nblocks_staged; + + /* + * Next block from range of rel_pages to consider placing in staged block + * array (it will be placed there if it's going to be scanned by VACUUM) + */ + BlockNumber next_block; + + /* + * Number of blocks that we still need to return, and number of blocks + * that we still need to prefetch + */ + BlockNumber scanned_pages_to_return; + BlockNumber scanned_pages_to_prefetch; + + /* offset of next block in line to return (from staged) */ + int next_return_idx; + /* offset of next block in line to prefetch (from staged) */ + int next_prefetch_idx; + /* offset of first garbage/invalid element (from staged) */ + int first_invalid_idx; +}; + + /* prototypes for internal routines */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); static void vm_extend(Relation rel, BlockNumber vm_nblocks); +static void vm_snap_stage_blocks(vmsnapshot *vmsnap); +static uint8 vm_snap_get_status(vmsnapshot *vmsnap, BlockNumber heapBlk); /* @@ -376,6 +458,354 @@ visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) return result; } +/* + * visibilitymap_snap_acquire - get read-only snapshot of visibility map + * + * Initializes VACUUM caller's snapshot, allocating memory in current context. + * Used by VACUUM to determine which pages it must scan up front. + * + * Set scanned_pages_lazy and scanned_pages_eager to help VACUUM decide on its + * scanning strategy. These are VACUUM's scanned_pages when it opts to skip + * all eligible pages and scanned_pages when it opts to just skip all-frozen + * pages, respectively. + * + * Caller finalizes scanning strategy by calling visibilitymap_snap_strategy. + * This determines the kind of blocks visibilitymap_snap_next should indicate + * need to be scanned by VACUUM. + */ +vmsnapshot * +visibilitymap_snap_acquire(Relation rel, BlockNumber rel_pages, + BlockNumber *scanned_pages_lazy, + BlockNumber *scanned_pages_eager) +{ + BlockNumber nvmpages = 0, + mapBlockLast = 0, + all_visible = 0, + all_frozen = 0; + uint8 mapbits_last_page = 0; + vmsnapshot *vmsnap; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_acquire %s %u", + RelationGetRelationName(rel), rel_pages); +#endif + + /* + * Allocate space for VM pages up to and including those required to have + * bits for the would-be heap block that is just beyond rel_pages + */ + if (rel_pages > 0) + { + mapBlockLast = HEAPBLK_TO_MAPBLOCK(rel_pages - 1); + nvmpages = mapBlockLast + 1; + } + + /* Allocate and initialize VM snapshot state */ + vmsnap = palloc0(sizeof(vmsnapshot)); + vmsnap->rel = rel; + vmsnap->strat = VMSNAP_SCAN_ALL; /* for now */ + vmsnap->rel_pages = rel_pages; /* scanned_pages for VMSNAP_SCAN_ALL */ + vmsnap->scanned_pages_lazy = 0; + vmsnap->scanned_pages_eager = 0; + + /* + * vmsnap temp file state. + * + * Only relations large enough to need more than one visibility map page + * use a temp file (cannot wholly rely on vmsnap's single page cache). 
+ */ + vmsnap->nvmpages = nvmpages; + vmsnap->file = NULL; + if (nvmpages > 1) + vmsnap->file = BufFileCreateTemp(false); + vmsnap->prefetch_distance = 0; +#ifdef USE_PREFETCH + vmsnap->prefetch_distance = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); +#endif + vmsnap->prefetch_distance = Max(vmsnap->prefetch_distance, MIN_PREFETCH_SIZE); + + /* cache of VM pages read from temp file */ + vmsnap->curvmpage = 0; + vmsnap->rawmap = NULL; + + /* staged blocks array state */ + vmsnap->current_nblocks_staged = 0; + vmsnap->next_block = 0; + vmsnap->scanned_pages_to_return = 0; + vmsnap->scanned_pages_to_prefetch = 0; + /* Offsets into staged blocks array */ + vmsnap->next_return_idx = 0; + vmsnap->next_prefetch_idx = 0; + vmsnap->first_invalid_idx = 0; + + for (BlockNumber mapBlock = 0; mapBlock <= mapBlockLast; mapBlock++) + { + Buffer mapBuffer; + char *map; + uint64 *umap; + + mapBuffer = vm_readbuf(rel, mapBlock, false); + if (!BufferIsValid(mapBuffer)) + { + /* + * Not all VM pages available. Remember that, so that we'll treat + * relevant heap pages as not all-visible/all-frozen when asked. + */ + vmsnap->nvmpages = mapBlock; + break; + } + + /* Cache page locally */ + LockBuffer(mapBuffer, BUFFER_LOCK_SHARE); + memcpy(vmsnap->vmpage.data, BufferGetPage(mapBuffer), BLCKSZ); + UnlockReleaseBuffer(mapBuffer); + + /* Finish off this VM page using snapshot's vmpage cache */ + vmsnap->curvmpage = mapBlock; + vmsnap->rawmap = map = PageGetContents(vmsnap->vmpage.data); + umap = (uint64 *) map; + + if (mapBlock == mapBlockLast) + { + uint32 mapByte; + uint8 mapOffset; + + /* + * The last VM page requires some extra steps. + * + * First get the status of the last heap page (page in the range + * of rel_pages) in passing. + */ + Assert(mapBlock == HEAPBLK_TO_MAPBLOCK(rel_pages - 1)); + mapByte = HEAPBLK_TO_MAPBYTE(rel_pages - 1); + mapOffset = HEAPBLK_TO_OFFSET(rel_pages - 1); + mapbits_last_page = ((map[mapByte] >> mapOffset) & + VISIBILITYMAP_VALID_BITS); + + /* + * Also defensively "truncate" our local copy of the last page in + * order to reliably exclude heap pages beyond the range of + * rel_pages. This is just paranoia. + */ + mapByte = HEAPBLK_TO_MAPBYTE(rel_pages); + mapOffset = HEAPBLK_TO_OFFSET(rel_pages); + if (mapByte != 0 || mapOffset != 0) + { + MemSet(&map[mapByte + 1], 0, MAPSIZE - (mapByte + 1)); + map[mapByte] &= (1 << mapOffset) - 1; + } + } + + /* Maintain count of all-frozen and all-visible pages */ + for (int i = 0; i < MAPSIZE / sizeof(uint64); i++) + { + all_visible += pg_popcount64(umap[i] & VISIBLE_MASK64); + all_frozen += pg_popcount64(umap[i] & FROZEN_MASK64); + } + + /* Finally, write out vmpage cache VM page to vmsnap's temp file */ + if (vmsnap->file) + BufFileWrite(vmsnap->file, vmsnap->vmpage.data, BLCKSZ); + } + + /* + * Should always have at least as many all_visible pages as all_frozen + * pages. Even still, we generally only interpret a page as all-frozen + * when both the all-visible and all-frozen bits are set together. Clamp + * so that we'll avoid giving our caller an obviously bogus summary of the + * visibility map when certain pages only have their all-frozen bit set. + * More paranoia. + */ + Assert(all_frozen <= all_visible && all_visible <= rel_pages); + all_frozen = Min(all_frozen, all_visible); + + /* + * Done copying all VM pages from authoritative VM into a VM snapshot. 
+ * + * Figure out the final scanned_pages for the two skipping policies that + * we might use: skipallvis (skip both all-frozen and all-visible) and + * skipallfrozen (just skip all-frozen). + */ + vmsnap->scanned_pages_lazy = rel_pages - all_visible; + vmsnap->scanned_pages_eager = rel_pages - all_frozen; + + /* + * When the last page is skippable in principle, it still won't be treated + * as skippable by visibilitymap_snap_next, which recognizes the last page + * as a special case. Compensate by incrementing each scanning strategy's + * scanned_pages as needed to avoid counting the last page as skippable. + * + * As usual we expect that the all-frozen bit can only be set alongside + * the all-visible bit (for any given page), but only interpret a page as + * truly all-frozen when both of its VM bits are set together. + */ + if (mapbits_last_page & VISIBILITYMAP_ALL_VISIBLE) + { + vmsnap->scanned_pages_lazy++; + if (mapbits_last_page & VISIBILITYMAP_ALL_FROZEN) + vmsnap->scanned_pages_eager++; + } + + *scanned_pages_lazy = vmsnap->scanned_pages_lazy; + *scanned_pages_eager = vmsnap->scanned_pages_eager; + + return vmsnap; +} + +/* + * visibilitymap_snap_strategy -- determine VACUUM's scanning strategy. + * + * VACUUM chooses a vmsnap strategy according to priorities around advancing + * relfrozenxid. See visibilitymap_snap_acquire. + */ +void +visibilitymap_snap_strategy(vmsnapshot *vmsnap, vmstrategy strat) +{ + int nprefetch; + + /* Remember final scanning strategy */ + vmsnap->strat = strat; + + if (vmsnap->strat == VMSNAP_SCAN_LAZY) + vmsnap->scanned_pages_to_return = vmsnap->scanned_pages_lazy; + else if (vmsnap->strat == VMSNAP_SCAN_EAGER) + vmsnap->scanned_pages_to_return = vmsnap->scanned_pages_eager; + else + vmsnap->scanned_pages_to_return = vmsnap->rel_pages; + + vmsnap->scanned_pages_to_prefetch = vmsnap->scanned_pages_to_return; + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_strategy %s %d %u", + RelationGetRelationName(vmsnap->rel), (int) strat, + vmsnap->scanned_pages_to_return); +#endif + + /* + * Stage blocks (may have to read from temp file). + * + * We rely on the assumption that we'll always have a large enough staged + * blocks array to accommodate any possible prefetch distance. + */ + vm_snap_stage_blocks(vmsnap); + + nprefetch = Min(vmsnap->current_nblocks_staged, vmsnap->prefetch_distance); +#ifdef USE_PREFETCH + for (int i = 0; i < nprefetch; i++) + { + PrefetchBuffer(vmsnap->rel, MAIN_FORKNUM, vmsnap->staged[i]); + } +#endif + + vmsnap->scanned_pages_to_prefetch -= nprefetch; + vmsnap->next_prefetch_idx += nprefetch; +} + +/* + * visibilitymap_snap_next -- get next block to scan from vmsnap. + * + * Returns next block in line for VACUUM to scan according to vmsnap. Caller + * skips any and all blocks preceding returned block. + * + * VACUUM always scans the last page to determine whether it has tuples. This + * is useful as a way of avoiding certain pathological cases with heap rel + * truncation. We always return the final block (rel_pages - 1) here last. 
+ */ +BlockNumber +visibilitymap_snap_next(vmsnapshot *vmsnap) +{ + BlockNumber next_block_to_scan; + + if (vmsnap->scanned_pages_to_return == 0) + return InvalidBlockNumber; + + /* Prepare to return this block */ + next_block_to_scan = vmsnap->staged[vmsnap->next_return_idx++]; + vmsnap->current_nblocks_staged--; + vmsnap->scanned_pages_to_return--; + + /* + * Did the staged blocks array just run out of blocks to return to caller, + * or do we need to stage more blocks for I/O prefetching purposes? + */ + Assert(vmsnap->next_prefetch_idx <= vmsnap->first_invalid_idx); + if ((vmsnap->current_nblocks_staged == 0 && + vmsnap->scanned_pages_to_return > 0) || + (vmsnap->next_prefetch_idx == vmsnap->first_invalid_idx && + vmsnap->scanned_pages_to_prefetch > 0)) + { + if (vmsnap->current_nblocks_staged > 0) + { + /* + * We've run out of prefetchable blocks, but still have some + * non-returned blocks. Shift existing blocks to the start of the + * array. The newly staged blocks go after these ones. + */ + memmove(&vmsnap->staged[0], + &vmsnap->staged[vmsnap->next_return_idx], + sizeof(BlockNumber) * vmsnap->current_nblocks_staged); + } + + /* + * Reset offsets in staged blocks array, while accounting for likely + * presence of preexisting blocks that have already been prefetched + * but have yet to be returned to VACUUM caller + */ + vmsnap->next_prefetch_idx -= vmsnap->next_return_idx; + vmsnap->first_invalid_idx -= vmsnap->next_return_idx; + vmsnap->next_return_idx = 0; + + /* Stage more blocks (may have to read from temp file) */ + vm_snap_stage_blocks(vmsnap); + } + + /* + * By here we're guaranteed to have at least one prefetchable block in the + * staged blocks array (unless we've already prefetched all blocks that + * will ever be returned to VACUUM caller) + */ + if (vmsnap->next_prefetch_idx < vmsnap->first_invalid_idx) + { +#ifdef USE_PREFETCH + /* Still have remaining blocks to prefetch, so prefetch next one */ + BlockNumber prefetch = vmsnap->staged[vmsnap->next_prefetch_idx++]; + + PrefetchBuffer(vmsnap->rel, MAIN_FORKNUM, prefetch); +#else + vmsnap->next_prefetch_idx++; +#endif + Assert(vmsnap->current_nblocks_staged > 1); + Assert(vmsnap->scanned_pages_to_prefetch > 0); + vmsnap->scanned_pages_to_prefetch--; + } + else + { + Assert(vmsnap->scanned_pages_to_prefetch == 0); + } + +#ifdef TRACE_VISIBILITYMAP + elog(DEBUG1, "visibilitymap_snap_next %s %u", + RelationGetRelationName(vmsnap->rel), next_block_to_scan); +#endif + + return next_block_to_scan; +} + +/* + * visibilitymap_snap_release - release previously acquired snapshot + * + * Frees resources allocated in visibilitymap_snap_acquire for VACUUM. + */ +void +visibilitymap_snap_release(vmsnapshot *vmsnap) +{ + Assert(vmsnap->scanned_pages_to_return == 0); + if (vmsnap->file) + BufFileClose(vmsnap->file); + pfree(vmsnap); +} + /* * visibilitymap_count - count number of bits set in visibility map * @@ -680,3 +1110,112 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) UnlockRelationForExtension(rel, ExclusiveLock); } + +/* + * Stage some heap blocks from vmsnap to return to VACUUM caller. + * + * Called when we completely run out of staged blocks to return to VACUUM, or + * when vmsnap still has some pending staged blocks, but too few to be able to + * prefetch incrementally as the remaining blocks are returned to VACUUM. 
+ */
+static void
+vm_snap_stage_blocks(vmsnapshot *vmsnap)
+{
+	Assert(vmsnap->current_nblocks_staged < STAGED_BUFSIZE);
+	Assert(vmsnap->first_invalid_idx < STAGED_BUFSIZE);
+	Assert(vmsnap->next_return_idx <= vmsnap->first_invalid_idx);
+	Assert(vmsnap->next_prefetch_idx <= vmsnap->first_invalid_idx);
+
+	while (vmsnap->next_block < vmsnap->rel_pages &&
+		   vmsnap->current_nblocks_staged < STAGED_BUFSIZE)
+	{
+		for (;;)
+		{
+			uint8		mapbits = vm_snap_get_status(vmsnap,
+													 vmsnap->next_block);
+
+			if ((mapbits & VISIBILITYMAP_ALL_VISIBLE) == 0)
+			{
+				Assert((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0);
+				break;
+			}
+
+			/*
+			 * Stop staging blocks just before the final page, which must
+			 * always be scanned by VACUUM
+			 */
+			if (vmsnap->next_block == vmsnap->rel_pages - 1)
+				break;
+
+			/* VMSNAP_SCAN_ALL forcing VACUUM to scan every page? */
+			if (vmsnap->strat == VMSNAP_SCAN_ALL)
+				break;
+
+			/*
+			 * Check if VACUUM must scan this page because it's not
+			 * all-frozen and VACUUM opted to use the VMSNAP_SCAN_EAGER
+			 * strategy
+			 */
+			if ((mapbits & VISIBILITYMAP_ALL_FROZEN) == 0 &&
+				vmsnap->strat == VMSNAP_SCAN_EAGER)
+				break;
+
+			/* VACUUM will skip this block -- so don't stage it for later */
+			vmsnap->next_block++;
+		}
+
+		/* VACUUM will scan this block, so stage it for later */
+		vmsnap->staged[vmsnap->first_invalid_idx++] = vmsnap->next_block++;
+		vmsnap->current_nblocks_staged++;
+	}
+}
+
+/*
+ * Get status of bits from vm snapshot
+ */
+static uint8
+vm_snap_get_status(vmsnapshot *vmsnap, BlockNumber heapBlk)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(heapBlk);
+
+#ifdef TRACE_VISIBILITYMAP
+	elog(DEBUG1, "vm_snap_get_status %u", heapBlk);
+#endif
+
+	/*
+	 * If we didn't see the VM page when the snapshot was first acquired, we
+	 * defensively assume that heapBlk is not all-visible or all-frozen
+	 */
+	Assert(heapBlk <= vmsnap->rel_pages);
+	if (unlikely(mapBlock >= vmsnap->nvmpages))
+		return 0;
+
+	/*
+	 * Read from temp file when required.
+	 *
+	 * Although this routine supports random access, sequential access is
+	 * expected.  We should only need to read each temp file page into cache
+	 * at most once per VACUUM.
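+	 *
+	 * The cache holds one whole VM page at a time.  Each VM page covers
+	 * HEAPBLOCKS_PER_PAGE heap blocks (32672 with the default 8KB BLCKSZ,
+	 * or roughly 255MB worth of heap), so even a snapshot of a 1TB table
+	 * spans only about 4100 temp file blocks, which a sequential pass over
+	 * the snapshot reads in ascending order.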
+	 */
+	if (unlikely(mapBlock != vmsnap->curvmpage))
+	{
+		size_t		nread;
+
+		if (BufFileSeekBlock(vmsnap->file, mapBlock) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not seek to block %u of vmsnap temporary file",
+							mapBlock)));
+		nread = BufFileRead(vmsnap->file, vmsnap->vmpage.data, BLCKSZ);
+		if (nread != BLCKSZ)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read block %u of vmsnap temporary file: read only %zu of %zu bytes",
+							mapBlock, nread, (size_t) BLCKSZ)));
+		vmsnap->curvmpage = mapBlock;
+		vmsnap->rawmap = PageGetContents(vmsnap->vmpage.data);
+	}
+
+	return ((vmsnap->rawmap[mapByte] >> mapOffset) & VISIBILITYMAP_VALID_BITS);
+}
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index dcdccea03..de2e98368 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -970,11 +970,11 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params,
 				freeze_strategy_threshold;
 	uint64		threshold_nblocks;
 	TransactionId nextXID,
-				safeOldestXmin,
-				aggressiveXIDCutoff;
+				safeOldestXmin;
 	MultiXactId nextMXID,
-				safeOldestMxact,
-				aggressiveMXIDCutoff;
+				safeOldestMxact;
+	double		XIDFrac,
+				MXIDFrac;
 
 	/* Use mutable copies of freeze age parameters */
 	freeze_min_age = params->freeze_min_age;
@@ -1114,48 +1114,48 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams *params,
 	cutoffs->freeze_strategy_threshold_nblocks = threshold_nblocks;
 
 	/*
-	 * Finally, figure out if caller needs to do an aggressive VACUUM or not.
-	 *
 	 * Determine the table freeze age to use: as specified by the caller, or
-	 * the value of the vacuum_freeze_table_age GUC, but in any case not more
-	 * than autovacuum_freeze_max_age * 0.95, so that if you have e.g nightly
-	 * VACUUM schedule, the nightly VACUUM gets a chance to freeze XIDs before
-	 * anti-wraparound autovacuum is launched.
+	 * the value of the vacuum_freeze_table_age GUC.  The GUC's default value
+	 * of -1 is interpreted as "just use autovacuum_freeze_max_age value".
+	 * Also clamp using autovacuum_freeze_max_age.
 	 */
 	if (freeze_table_age < 0)
 		freeze_table_age = vacuum_freeze_table_age;
-	freeze_table_age = Min(freeze_table_age, autovacuum_freeze_max_age * 0.95);
-	Assert(freeze_table_age >= 0);
-	aggressiveXIDCutoff = nextXID - freeze_table_age;
-	if (!TransactionIdIsNormal(aggressiveXIDCutoff))
-		aggressiveXIDCutoff = FirstNormalTransactionId;
-	if (TransactionIdPrecedesOrEquals(rel->rd_rel->relfrozenxid,
-									  aggressiveXIDCutoff))
-		return true;
+	if (freeze_table_age < 0 || freeze_table_age > autovacuum_freeze_max_age)
+		freeze_table_age = autovacuum_freeze_max_age;
 
 	/*
 	 * Similar to the above, determine the table freeze age to use for
 	 * multixacts: as specified by the caller, or the value of the
-	 * vacuum_multixact_freeze_table_age GUC, but in any case not more than
-	 * effective_multixact_freeze_max_age * 0.95, so that if you have e.g.
-	 * nightly VACUUM schedule, the nightly VACUUM gets a chance to freeze
-	 * multixacts before anti-wraparound autovacuum is launched.
+	 * vacuum_multixact_freeze_table_age GUC.  The GUC's default value of -1
+	 * is interpreted as "just use effective_multixact_freeze_max_age value".
+	 * Also clamp using effective_multixact_freeze_max_age.
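+	 *
+	 * As a worked example (the numbers are purely illustrative): with a
+	 * clamped multixact_freeze_table_age of 400 million and relminmxid
+	 * lagging nextMXID by 100 million, MXIDFrac (computed below) comes out
+	 * at roughly 0.25, meaning the table has consumed about a quarter of
+	 * the table age budget that forces relfrozenxid/relminmxid advancement
+	 * once tableagefrac reaches 1.0.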
 	 */
 	if (multixact_freeze_table_age < 0)
 		multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
-	multixact_freeze_table_age =
-		Min(multixact_freeze_table_age,
-			effective_multixact_freeze_max_age * 0.95);
-	Assert(multixact_freeze_table_age >= 0);
-	aggressiveMXIDCutoff = nextMXID - multixact_freeze_table_age;
-	if (aggressiveMXIDCutoff < FirstMultiXactId)
-		aggressiveMXIDCutoff = FirstMultiXactId;
-	if (MultiXactIdPrecedesOrEquals(rel->rd_rel->relminmxid,
-									aggressiveMXIDCutoff))
-		return true;
+	if (multixact_freeze_table_age < 0 ||
+		multixact_freeze_table_age > effective_multixact_freeze_max_age)
+		multixact_freeze_table_age = effective_multixact_freeze_max_age;
 
-	/* Non-aggressive VACUUM */
-	return false;
+	/*
+	 * Finally, set tableagefrac for VACUUM.  This can come from either XID
+	 * or MXID table age (whichever is currently greater).
+	 */
+	XIDFrac = (double) (nextXID - cutoffs->relfrozenxid) /
+		((double) freeze_table_age + 0.5);
+	MXIDFrac = (double) (nextMXID - cutoffs->relminmxid) /
+		((double) multixact_freeze_table_age + 0.5);
+	cutoffs->tableagefrac = Max(XIDFrac, MXIDFrac);
+
+	/*
+	 * Make sure that antiwraparound autovacuums reliably advance
+	 * relfrozenxid to the satisfaction of autovacuum.c, even when the
+	 * reloption version of autovacuum_freeze_max_age happens to be in use
+	 */
+	if (params->is_wraparound)
+		cutoffs->tableagefrac = 1.0;
+
+	return (cutoffs->tableagefrac >= 1.0);
 }
 
 /*
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 615bee883..e8c6c13da 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2497,10 +2497,10 @@ struct config_int ConfigureNamesInt[] =
 	{
 		{"vacuum_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT,
 			gettext_noop("Age at which VACUUM should scan whole table to freeze tuples."),
-			NULL
+			gettext_noop("-1 to use autovacuum_freeze_max_age value.")
 		},
 		&vacuum_freeze_table_age,
-		150000000, 0, 2000000000,
+		-1, -1, 2000000000,
 		NULL, NULL, NULL
 	},
 
@@ -2517,10 +2517,10 @@ struct config_int ConfigureNamesInt[] =
 	{
 		{"vacuum_multixact_freeze_table_age", PGC_USERSET, CLIENT_CONN_STATEMENT,
 			gettext_noop("Multixact age at which VACUUM should scan whole table to freeze tuples."),
-			NULL
+			gettext_noop("-1 to use autovacuum_multixact_freeze_max_age value.")
 		},
 		&vacuum_multixact_freeze_table_age,
-		150000000, 0, 2000000000,
+		-1, -1, 2000000000,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 6d8c76cf6..acdf7be61 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -660,6 +660,13 @@
 					# autovacuum, -1 means use
 					# vacuum_cost_limit
 
+# - AUTOVACUUM compatibility options (legacy) -
+
+#vacuum_freeze_table_age = -1		# target maximum XID age, or -1 to
+					# use autovacuum_freeze_max_age
+#vacuum_multixact_freeze_table_age = -1	# target maximum MXID age, or -1 to
+					# use autovacuum_multixact_freeze_max_age
+
 #------------------------------------------------------------------------------
 # CLIENT CONNECTION DEFAULTS
@@ -693,11 +700,9 @@
 #lock_timeout = 0			# in milliseconds, 0 is disabled
 #idle_in_transaction_session_timeout = 0	# in milliseconds, 0 is disabled
 #idle_session_timeout = 0		# in milliseconds, 0 is disabled
-#vacuum_freeze_table_age = 150000000
 #vacuum_freeze_strategy_threshold = 4GB
 #vacuum_freeze_min_age = 50000000
 #vacuum_failsafe_age = 1600000000
-#vacuum_multixact_freeze_table_age = 150000000
 #vacuum_multixact_freeze_min_age = 5000000
 #vacuum_multixact_failsafe_age = 1600000000
 #bytea_output = 'hex'			# hex, escape
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b995c3824..c98e6c306 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9210,20 +9210,28 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
-        VACUUM performs an aggressive scan if the table's
-        pg_class.relfrozenxid field has reached
-        the age specified by this setting.  An aggressive scan differs from
-        a regular VACUUM in that it visits every page that might
-        contain unfrozen XIDs or MXIDs, not just those that might contain dead
-        tuples.  The default is 150 million transactions.  Although users can
-        set this value anywhere from zero to two billion, VACUUM
-        will silently limit the effective value to 95% of
-        , so that a
-        periodic manual VACUUM has a chance to run before an
-        anti-wraparound autovacuum is launched for the table.  For more
-        information see
-        .
+        VACUUM reliably advances
+        relfrozenxid to a recent value if
+        the table's
+        pg_class.relfrozenxid
+        field has reached the age specified by this setting.
+        The default is -1.  If -1 is specified, the value
+        of is used.
+        Although users can set this value anywhere from zero to two
+        billion, VACUUM will silently limit the
+        effective value to .  For more
+        information see .
+
+
+        The meaning of this parameter, and its default value, changed
+        in PostgreSQL 16.  Freezing and advancing
+        pg_class.relfrozenxid
+        now take place more proactively, based on criteria that consider both
+        costs and benefits.
+
+
@@ -9292,19 +9300,27 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
-        VACUUM performs an aggressive scan if the table's
-        pg_class.relminmxid field has reached
-        the age specified by this setting.  An aggressive scan differs from
-        a regular VACUUM in that it visits every page that might
-        contain unfrozen XIDs or MXIDs, not just those that might contain dead
-        tuples.  The default is 150 million multixacts.
-        Although users can set this value anywhere from zero to two billion,
-        VACUUM will silently limit the effective value to 95% of
-        , so that a
-        periodic manual VACUUM has a chance to run before an
-        anti-wraparound is launched for the table.
-        For more information see .
+        VACUUM reliably advances
+        relminmxid to a recent value if the table's
+        pg_class.relminmxid
+        field has reached the age specified by this setting.
+        The default is -1.  If -1 is specified, the value of is used.
+        Although users can set this value anywhere from zero to two
+        billion, VACUUM will silently limit the
+        effective value to .  For more
+        information see .
+
+
+        The meaning of this parameter, and its default value, changed
+        in PostgreSQL 16.  Freezing and advancing
+        pg_class.relminmxid
+        now take place more proactively, based on criteria that consider both
+        costs and benefits.
+
+
diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml
index 759ea5ac9..18c32983f 100644
--- a/doc/src/sgml/maintenance.sgml
+++ b/doc/src/sgml/maintenance.sgml
@@ -497,13 +497,6 @@
     aggressive vacuum, which will freeze all eligible unfrozen XID and MXID
     values, including those from all-visible but not all-frozen pages.  In
     practice most tables require periodic aggressive vacuuming.
-
-    controls when VACUUM does that: all-visible but not all-frozen
-    pages are scanned if the number of transactions that have passed since the
-    last such scan is greater than vacuum_freeze_table_age minus
-    vacuum_freeze_min_age.  Setting
-    vacuum_freeze_table_age to 0 forces VACUUM to
-    always use its aggressive strategy.
@@ -533,27 +526,9 @@ vacuum_freeze_min_age.
-
-    The effective maximum for vacuum_freeze_table_age is 0.95 *
-    autovacuum_freeze_max_age; a setting higher than that will be
-    capped to the maximum.  A value higher than
-    autovacuum_freeze_max_age wouldn't make sense because an
-    anti-wraparound autovacuum would be triggered at that point anyway, and
-    the 0.95 multiplier leaves some breathing room to run a manual
-    VACUUM before that happens.  As a rule of thumb,
-    vacuum_freeze_table_age should be set to a value somewhat
-    below autovacuum_freeze_max_age, leaving enough gap so that
-    a regularly scheduled VACUUM or an autovacuum triggered by
-    normal delete and update activity is run in that window.  Setting it too
-    close could lead to anti-wraparound autovacuums, even though the table
-    was recently vacuumed to reclaim space, whereas lower values lead to more
-    frequent aggressive vacuuming.
-
     The sole disadvantage of increasing autovacuum_freeze_max_age
-    (and vacuum_freeze_table_age along with it) is that
-    the pg_xact and pg_commit_ts
+    is that the pg_xact and pg_commit_ts
     subdirectories of the database cluster will take more space, because
     it must store the commit status and (if track_commit_timestamp is
     enabled) timestamp of all transactions back to
@@ -630,7 +605,7 @@ SELECT datname, age(datfrozenxid) FROM pg_database;
     advanced when every page of the table that
     might contain unfrozen XIDs is scanned.  This happens when
     relfrozenxid is more than
-    vacuum_freeze_table_age transactions old, when
+    autovacuum_freeze_max_age transactions old, when
     VACUUM's FREEZE option is used, or when all pages
     that are not already all-frozen happen to require vacuuming to
     remove dead row versions.  When VACUUM
@@ -648,6 +623,29 @@ SELECT datname, age(datfrozenxid) FROM pg_database;
     be forced for the table.
+
+
+    vacuum_freeze_table_age can be used to override
+    autovacuum_freeze_max_age locally.
+    VACUUM will advance
+    relfrozenxid in the same way as it
+    would have if autovacuum_freeze_max_age had been
+    set to the same value, without any direct impact on autovacuum
+    scheduling.
+
+
+    Prior to PostgreSQL 16,
+    VACUUM did not apply a cost model to decide
+    when to advance relfrozenxid, which
+    made vacuum_freeze_table_age an important
+    tunable setting.  This is no longer the case.  The revised
+    vacuum_freeze_table_age default of
+    -1 makes VACUUM use
+    autovacuum_freeze_max_age as an input to its
+    cost model, which should be adequate in most environments.
+
+
     If for some reason autovacuum fails to clear old XIDs from a table, the
     system will begin to emit warning messages like this when the database's
@@ -720,12 +718,6 @@ HINT:  Stop the postmaster and vacuum that database in single-user mode.
     transaction ID, or a newer multixact ID.  For each table,
     pg_class.relminmxid stores the oldest
     possible multixact ID still appearing in any tuple of that table.
-    If this value is older than
-    , an aggressive
-    vacuum is forced.  As discussed in the previous section, an aggressive
-    vacuum means that only those pages which are known to be all-frozen will
-    be skipped.  mxid_age() can be used on
-    pg_class.relminmxid to find its age.
@@ -844,10 +836,22 @@ vacuum insert threshold = vacuum base insert threshold + vacuum insert scale fac
     DELETE and INSERT operation.
     (It is only semi-accurate because some information might be lost under
     heavy load.)
     If the relfrozenxid value of the table
-    is more than vacuum_freeze_table_age transactions old,
-    an aggressive vacuum is performed to freeze old tuples and advance
-    relfrozenxid; otherwise, only pages that have been modified
-    since the last vacuum are scanned.
+    is more than autovacuum_freeze_max_age transactions old,
+    vacuum must freeze old tuples from existing all-visible pages to
+    be able to advance relfrozenxid;
+    otherwise, vacuum applies a cost model that advances
+    relfrozenxid whenever the added cost of
+    doing so during the ongoing operation is sufficiently low.
+    autovacuum_freeze_max_age is used to tell
+    VACUUM how old
+    relfrozenxid may grow in the
+    worst case, which is often only weakly predictive of the actual
+    rate of advancement.  Much depends on workload characteristics.
+    A cost model
+    dynamically determines whether or not to advance
+    relfrozenxid at the start of each
+    VACUUM.  The model finds the most opportune
+    time by weighing the added cost of advancement against the age
+    that relfrozenxid has already attained.
diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml
index 545b23b54..6ba4385a0 100644
--- a/doc/src/sgml/ref/vacuum.sgml
+++ b/doc/src/sgml/ref/vacuum.sgml
@@ -158,11 +158,11 @@ VACUUM [ FULL ] [ FREEZE ] [ VERBOSE ] [ ANALYZE ] [
       visibility map.  Pages where
-      all tuples are known to be frozen can always be skipped, and those
-      where all tuples are known to be visible to all transactions may be
-      skipped except when performing an aggressive vacuum.  Furthermore,
-      except when performing an aggressive vacuum, some pages may be skipped
-      in order to avoid waiting for other sessions to finish using them.
+      all tuples are known to be frozen can always be skipped.  Pages
+      where all tuples are known to be visible to all transactions are
+      skipped whenever VACUUM determines that
+      advancing relfrozenxid and
+      relminmxid is unnecessary.
       This option disables all page-skipping behavior, and is intended to
       be used only when the contents of the visibility map are suspect,
       which should happen only if there is a hardware or software
diff --git a/src/test/regress/expected/reloptions.out b/src/test/regress/expected/reloptions.out
index b6aef6f65..0e569d300 100644
--- a/src/test/regress/expected/reloptions.out
+++ b/src/test/regress/expected/reloptions.out
@@ -102,8 +102,8 @@ SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass;
 INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL);
 ERROR:  null value in column "i" of relation "reloptions_test" violates not-null constraint
 DETAIL:  Failing row contains (null, null).
--- Do an aggressive vacuum to prevent page-skipping.
-VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test;
+-- Do a VACUUM FREEZE to prevent skipping any pruning.
+VACUUM FREEZE reloptions_test;
 SELECT pg_relation_size('reloptions_test') > 0;
  ?column? 
 ----------
@@ -128,8 +128,8 @@ SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass;
 INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL);
 ERROR:  null value in column "i" of relation "reloptions_test" violates not-null constraint
 DETAIL:  Failing row contains (null, null).
--- Do an aggressive vacuum to prevent page-skipping.
-VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test;
+-- Do a VACUUM FREEZE to prevent skipping any pruning.
+VACUUM FREEZE reloptions_test;
 SELECT pg_relation_size('reloptions_test') = 0;
  ?column? 
 ----------
diff --git a/src/test/regress/sql/reloptions.sql b/src/test/regress/sql/reloptions.sql
index 4252b0202..b2bed8ed8 100644
--- a/src/test/regress/sql/reloptions.sql
+++ b/src/test/regress/sql/reloptions.sql
@@ -61,8 +61,8 @@ CREATE TEMP TABLE reloptions_test(i INT NOT NULL, j text)
 	autovacuum_enabled=false);
 SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass;
 INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL);
--- Do an aggressive vacuum to prevent page-skipping.
-VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test;
+-- Do a VACUUM FREEZE to prevent skipping any pruning.
+VACUUM FREEZE reloptions_test;
 SELECT pg_relation_size('reloptions_test') > 0;
 
 SELECT reloptions FROM pg_class WHERE oid =
@@ -72,8 +72,8 @@ SELECT reloptions FROM pg_class WHERE oid =
 ALTER TABLE reloptions_test RESET (vacuum_truncate);
 SELECT reloptions FROM pg_class WHERE oid = 'reloptions_test'::regclass;
 INSERT INTO reloptions_test VALUES (1, NULL), (NULL, NULL);
--- Do an aggressive vacuum to prevent page-skipping.
-VACUUM (FREEZE, DISABLE_PAGE_SKIPPING) reloptions_test;
+-- Do a VACUUM FREEZE to prevent skipping any pruning.
+VACUUM FREEZE reloptions_test;
 SELECT pg_relation_size('reloptions_test') = 0;
 
 -- Test toast.* options
-- 
2.39.0