From 1fc1a338e5d6621f89df46fe29d08c799267b39d Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Wed, 17 Sep 2025 15:52:18 -0400 Subject: [PATCH v14 14/24] Eliminate XLOG_HEAP2_VISIBLE from vacuum phase III Instead of emitting a separate XLOG_HEAP2_VISIBLE record for each page that is rendered all-visible by vacuum's third phase, include the updates to the VM in the already emitted XLOG_HEAP2_PRUNE_VACUUM_CLEANUP record. The visibilitymap bits are stored in the flags member of the xl_heap_prune struct. This can decrease the number of WAL records vacuum phase III emits by as much as half. Author: Melanie Plageman Reviewed-by: Kirill Reshke Reviewed-by: Andres Freund Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- src/backend/access/heap/heapam_xlog.c | 147 ++++++++++++++++++------- src/backend/access/heap/pruneheap.c | 37 ++++++- src/backend/access/heap/vacuumlazy.c | 38 +++---- src/backend/access/rmgrdesc/heapdesc.c | 11 +- src/include/access/heapam.h | 1 + src/include/access/heapam_xlog.h | 25 ++++- 6 files changed, 190 insertions(+), 69 deletions(-) diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 68b41f39e69..c1f332f7a9a 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -35,7 +35,9 @@ heap_xlog_prune_freeze(XLogReaderState *record) Buffer buffer; RelFileLocator rlocator; BlockNumber blkno; - XLogRedoAction action; + Buffer vmbuffer = InvalidBuffer; + uint8 vmflags = 0; + Size freespace = 0; XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno); memcpy(&xlrec, maindataptr, SizeOfHeapPrune); @@ -50,11 +52,22 @@ heap_xlog_prune_freeze(XLogReaderState *record) Assert((xlrec.flags & XLHP_CLEANUP_LOCK) != 0 || (xlrec.flags & (XLHP_HAS_REDIRECTIONS | XLHP_HAS_DEAD_ITEMS)) == 0); + if (xlrec.flags & XLHP_VM_ALL_VISIBLE) + { + vmflags = VISIBILITYMAP_ALL_VISIBLE; + if (xlrec.flags & XLHP_VM_ALL_FROZEN) + 
vmflags |= VISIBILITYMAP_ALL_FROZEN; + } + /* - * We are about to remove and/or freeze tuples. In Hot Standby mode, - * ensure that there are no queries running for which the removed tuples - * are still visible or which still consider the frozen xids as running. - * The conflict horizon XID comes after xl_heap_prune. + * After xl_heap_prune is the optional snapshot conflict horizon. + * + * In Hot Standby mode, we must ensure that there are no running queries + * which would conflict with the changes in this record. That means we + * can't replay this record if it removes tuples that are still visible to + * transactions on the standby, freeze tuples with xids that are still + * considered running on the standby, or set a page as all-visible in the + * VM if it isn't all-visible to all transactions on the standby. */ if ((xlrec.flags & XLHP_HAS_CONFLICT_HORIZON) != 0) { @@ -71,12 +84,12 @@ heap_xlog_prune_freeze(XLogReaderState *record) } /* - * If we have a full-page image, restore it and we're done. + * If we have a full-page image of the heap block, restore it and we're + * done with the heap block. */ - action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, - (xlrec.flags & XLHP_CLEANUP_LOCK) != 0, - &buffer); - if (action == BLK_NEEDS_REDO) + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, + (xlrec.flags & XLHP_CLEANUP_LOCK) != 0, + &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); OffsetNumber *redirected; @@ -100,6 +113,11 @@ heap_xlog_prune_freeze(XLogReaderState *record) do_prune = nredirected > 0 || ndead > 0 || nunused > 0; + /* Ensure the record does something */ + Assert(do_prune || nplans > 0 || + vmflags & VISIBILITYMAP_VALID_BITS || + xlrec.flags & XLHP_SET_PD_ALL_VIS); + /* * Update all line pointers per the record, and repair fragmentation * if needed. @@ -147,15 +165,23 @@ heap_xlog_prune_freeze(XLogReaderState *record) * page-level PD_ALL_VISIBLE bit is clear. 
If that were to occur, * then a subsequent page modification would fail to clear the * visibility map bit. + * + * Note: we don't worry about updating the page's prunability hints. + * At worst this will cause an extra prune cycle to occur soon. */ if (xlrec.flags & XLHP_SET_PD_ALL_VIS) PageSetAllVisible(page); /* - * Note: we don't worry about updating the page's prunability hints. - * At worst this will cause an extra prune cycle to occur soon. + * We must never end up with the VM bit set and the page-level + * PD_ALL_VISIBLE bit clear. If that were to occur, a subsequent page + * modification would fail to clear the VM bit. */ - MarkBufferDirty(buffer); + Assert(!(vmflags & VISIBILITYMAP_VALID_BITS) || PageIsAllVisible(page)); + + /* If this record only sets the VM, no need to dirty the heap page */ + if (do_prune || nplans > 0 || xlrec.flags & XLHP_SET_PD_ALL_VIS) + MarkBufferDirty(buffer); /* * We always emit a WAL record when setting PD_ALL_VISIBLE, but we are @@ -171,47 +197,94 @@ heap_xlog_prune_freeze(XLogReaderState *record) } /* - * If we released any space or line pointers or set PD_ALL_VISIBLE update - * the freespace map. + * If we released any space or line pointers or set PD_ALL_VISIBLE or the + * VM, update the freespace map. * - * Even if we are just setting PD_ALL_VISIBLE (and thus not freeing up any - * space), we'll still update the FSM for this page. Since the FSM is not - * WAL-logged and only updated heuristically, it easily becomes stale in - * standbys. If the standby is later promoted and runs VACUUM, it will - * skip updating individual free space figures for pages that became - * all-visible (or all-frozen, depending on the vacuum mode,) which is - * troublesome when FreeSpaceMapVacuum propagates too optimistic free - * space values to upper FSM layers; later inserters try to use such pages - * only to find out that they are unusable. This can cause long stalls - * when there are many such pages. 
+ * Even if we are just setting PD_ALL_VISIBLE or updating the VM (and thus + * not freeing up any space), we'll still update the FSM for this page. + * Since the FSM is not WAL-logged and only updated heuristically, it + * easily becomes stale in standbys. If the standby is later promoted and + * runs VACUUM, it will skip updating individual free space figures for + * pages that became all-visible (or all-frozen, depending on the vacuum + * mode,) which is troublesome when FreeSpaceMapVacuum propagates too + * optimistic free space values to upper FSM layers; later inserters try + * to use such pages only to find out that they are unusable. This can + * cause long stalls when there are many such pages. * * Forestall those problems by updating FSM's idea about a page that is * becoming all-visible or all-frozen. * * Do this regardless of a full-page image being applied, since the FSM * data is not in the page anyway. + * + * We want to avoid holding an exclusive lock on the heap buffer while + * doing IO (either of the FSM or the VM), so we'll release the lock on + * the heap buffer before doing either. */ if (BufferIsValid(buffer)) { if (xlrec.flags & (XLHP_HAS_REDIRECTIONS | XLHP_HAS_DEAD_ITEMS | XLHP_HAS_NOW_UNUSED_ITEMS | - XLHP_SET_PD_ALL_VIS)) - { - Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); + XLHP_SET_PD_ALL_VIS | + (vmflags & VISIBILITYMAP_VALID_BITS))) + freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); - /* - * We want to avoid holding an exclusive lock on the heap buffer - * while doing IO, so we'll release the lock on the heap buffer - * first. - */ - UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(buffer); + } - XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); + /* + * Now read and update the VM block. + * + * Note that the heap relation may have been dropped or truncated, leading + * us to skip updating the heap block due to the LSN interlock. 
However, + * even in that case, it's still safe to update the visibility map. Any + * WAL record that clears the visibility map bit does so before checking + * the page LSN, so any bits that need to be cleared will still be + * cleared. + * + * Note that the lock on the heap page was dropped above. In normal + * operation this would never be safe because a concurrent query could + * modify the heap page and clear PD_ALL_VISIBLE -- violating the + * invariant that PD_ALL_VISIBLE must be set if the corresponding bit in + * the VM is set. + * + * In recovery, we expect no other writers, so writing to the VM page + * without holding a lock on the heap page is considered safe enough. It + * is done this way when replaying xl_heap_visible records (see + * heap_xlog_visible()). + */ + if (vmflags & VISIBILITYMAP_VALID_BITS && + XLogReadBufferForRedoExtended(record, 1, + RBM_ZERO_ON_ERROR, + false, + &vmbuffer) == BLK_NEEDS_REDO) + { + Page vmpage = BufferGetPage(vmbuffer); + uint8 old_vmbits = 0; + Relation reln = CreateFakeRelcacheEntry(rlocator); + + /* initialize the page if it was read as zeros */ + if (PageIsNew(vmpage)) + PageInit(vmpage, BLCKSZ, 0); + + old_vmbits = visibilitymap_set_vmbits(reln, blkno, vmbuffer, vmflags); + + /* Only set VM page LSN if we modified the page */ + if (old_vmbits != vmflags) + { + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(BufferGetPage(vmbuffer), lsn); } - else - UnlockReleaseBuffer(buffer); + + FreeFakeRelcacheEntry(reln); } + + if (BufferIsValid(vmbuffer)) + UnlockReleaseBuffer(vmbuffer); + + if (freespace > 0) + XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } /* diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 9b25131543b..9e00fbf3cd1 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -20,6 +20,7 @@ #include "access/multixact.h" #include "access/transam.h" #include "access/xlog.h" +#include "access/visibilitymapdefs.h" #include 
"access/xloginsert.h" #include "commands/vacuum.h" #include "executor/instrument.h" @@ -913,6 +914,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, conflict_xid = prstate.latest_xid_removed; log_heap_prune_and_freeze(relation, buffer, + InvalidBuffer, 0, conflict_xid, true, do_set_pd_vis, @@ -2088,14 +2090,18 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * * This is used for several different page maintenance operations: * - * - Page pruning, in VACUUM's 1st pass or on access: Some items are + * - Page pruning, in vacuum phase I or on-access: Some items are * redirected, some marked dead, and some removed altogether. * - * - Freezing: Items are marked as 'frozen'. + * - Freezing: During vacuum phase I, items are marked as 'frozen' * - * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused. + * - Reaping: During vacuum phase III, items that are already LP_DEAD are + * marked as unused. * - * They have enough commonalities that we use a single WAL record for them + * - VM updates: After vacuum phase III, the heap page may be marked + * all-visible and all-frozen. + * + * These changes all happen together, so we use a single WAL record for them * all. * * If replaying the record requires a cleanup lock, pass cleanup_lock = true. @@ -2103,6 +2109,10 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * replaying 'unused' items depends on whether they were all previously marked * as dead. * + * If the VM is being updated, vmflags will contain the bits to set. In this + * case, vmbuffer should already have been updated and marked dirty and should + * still be pinned and locked. + * * set_pd_all_vis indicates that we set PD_ALL_VISIBLE and thus should update * the page LSN when checksums/wal_log_hints are enabled even if we did not * prune or freeze tuples on the page. 
@@ -2113,6 +2123,7 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, */ void log_heap_prune_and_freeze(Relation relation, Buffer buffer, + Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, bool set_pd_all_vis, @@ -2139,6 +2150,8 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, xlrec.flags = 0; regbuf_flags = REGBUF_STANDARD; + Assert((vmflags & VISIBILITYMAP_VALID_BITS) == vmflags); + /* * We can avoid an FPI if the only modification we are making to the heap * page is to set PD_ALL_VISIBLE and checksums/wal_log_hints are disabled. @@ -2157,6 +2170,10 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, */ XLogBeginInsert(); XLogRegisterBuffer(0, buffer, regbuf_flags); + + if (vmflags & VISIBILITYMAP_VALID_BITS) + XLogRegisterBuffer(1, vmbuffer, 0); + if (nfrozen > 0) { int nplans; @@ -2213,6 +2230,12 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, * Prepare the main xl_heap_prune record. We already set the XLHP_HAS_* * flag above. */ + if (vmflags & VISIBILITYMAP_ALL_VISIBLE) + { + xlrec.flags |= XLHP_VM_ALL_VISIBLE; + if (vmflags & VISIBILITYMAP_ALL_FROZEN) + xlrec.flags |= XLHP_VM_ALL_FROZEN; + } if (set_pd_all_vis) xlrec.flags |= XLHP_SET_PD_ALL_VIS; if (RelationIsAccessibleInLogicalDecoding(relation)) @@ -2247,6 +2270,12 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, } recptr = XLogInsert(RM_HEAP2_ID, info); + if (vmflags & VISIBILITYMAP_VALID_BITS) + { + Assert(BufferIsDirty(vmbuffer)); + PageSetLSN(BufferGetPage(vmbuffer), recptr); + } + /* * We must bump the page LSN if pruning or freezing. 
If we are only * updating PD_ALL_VISIBLE, though, we can skip doing this unless diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index a0f3984e37f..b6c973cd111 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1906,6 +1906,8 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, log_newpage_buffer(buf, true); else log_heap_prune_and_freeze(vacrel->rel, buf, + InvalidBuffer, + 0, InvalidTransactionId, /* conflict xid */ false, /* cleanup lock */ true, /* set_pd_all_vis */ @@ -2817,6 +2819,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, OffsetNumber unused[MaxHeapTuplesPerPage]; int nunused = 0; TransactionId visibility_cutoff_xid; + TransactionId conflict_xid = InvalidTransactionId; bool all_frozen; LVSavedErrInfo saved_err_info; uint8 vmflags = 0; @@ -2842,6 +2845,9 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, vmflags |= VISIBILITYMAP_ALL_FROZEN; Assert(!TransactionIdIsValid(visibility_cutoff_xid)); } + + /* Take the lock on the vmbuffer before entering a critical section */ + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); } START_CRIT_SECTION(); @@ -2868,7 +2874,13 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, * setting the VM, we must set PD_ALL_VISIBLE as well. */ if ((vmflags & VISIBILITYMAP_VALID_BITS) != 0) + { PageSetAllVisible(page); + visibilitymap_set_vmbits(vacrel->rel, + blkno, + vmbuffer, vmflags); + conflict_xid = visibility_cutoff_xid; + } /* * Mark buffer dirty before we write WAL. 
@@ -2879,7 +2891,8 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, if (RelationNeedsWAL(vacrel->rel)) { log_heap_prune_and_freeze(vacrel->rel, buffer, - InvalidTransactionId, + vmbuffer, vmflags, + conflict_xid, false, /* no cleanup lock required */ (vmflags & VISIBILITYMAP_VALID_BITS) != 0, PRUNE_VACUUM_CLEANUP, @@ -2889,36 +2902,17 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, unused, nunused); } - /* - * Note that we don't end the critical section until after emitting the VM - * record. This ensures both PD_ALL_VISIBLE and the VM bits are set or - * unset in the event of a crash. While it is correct for PD_ALL_VISIBLE - * to be set and the VM to be clear, we should do our best to keep these - * in sync. This does mean that we will take a lock on the VM buffer - * inside of a critical section, which is generally discouraged. There is - * precedent for this in other callers of visibilitymap_set(), though. - */ + END_CRIT_SECTION(); - /* - * Now that we have removed the LP_DEAD items from the page, set the - * visibility map if the page became all-visible/all-frozen. Changes to - * the heap page have already been logged. 
- */ if ((vmflags & VISIBILITYMAP_ALL_VISIBLE) != 0) { - visibilitymap_set(vacrel->rel, blkno, - InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid, - vmflags); - /* Count the newly set VM page for logging */ + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); vacrel->vm_new_visible_pages++; if (all_frozen) vacrel->vm_new_visible_frozen_pages++; } - END_CRIT_SECTION(); - /* Revert to the previous phase information for error traceback */ restore_vacuum_error_info(vacrel, &saved_err_info); } diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c index b48d7dc1d24..1cb44ca32d3 100644 --- a/src/backend/access/rmgrdesc/heapdesc.c +++ b/src/backend/access/rmgrdesc/heapdesc.c @@ -103,7 +103,7 @@ plan_elem_desc(StringInfo buf, void *plan, void *data) * code, the latter of which is used in frontend (pg_waldump) code. */ void -heap_xlog_deserialize_prune_and_freeze(char *cursor, uint8 flags, +heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags, int *nplans, xlhp_freeze_plan **plans, OffsetNumber **frz_offsets, int *nredirected, OffsetNumber **redirected, @@ -287,6 +287,15 @@ heap2_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, ", isCatalogRel: %c", xlrec->flags & XLHP_IS_CATALOG_REL ? 
'T' : 'F'); + if (xlrec->flags & XLHP_VM_ALL_VISIBLE) + { + uint8 vmflags = VISIBILITYMAP_ALL_VISIBLE; + + if (xlrec->flags & XLHP_VM_ALL_FROZEN) + vmflags |= VISIBILITYMAP_ALL_FROZEN; + appendStringInfo(buf, ", vm_flags: 0x%02X", vmflags); + } + if (XLogRecHasBlockData(record, 0)) { Size datalen; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 2f77d8dbcd6..be66970c9f0 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -389,6 +389,7 @@ extern void heap_page_prune_execute(Buffer buffer, bool lp_truncate_only, OffsetNumber *nowunused, int nunused); extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, + Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, bool set_pd_all_vis, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 82b8f7f2bbc..833114e0a6e 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -249,7 +249,7 @@ typedef struct xl_heap_update * Main data section: * * xl_heap_prune - * uint8 flags + * uint16 flags * TransactionId snapshot_conflict_horizon * * Block 0 data section: @@ -284,7 +284,7 @@ typedef struct xl_heap_update */ typedef struct xl_heap_prune { - uint8 flags; + uint16 flags; /* * If XLHP_HAS_CONFLICT_HORIZON is set, the conflict horizon XID follows, @@ -292,11 +292,17 @@ */ } xl_heap_prune; -#define SizeOfHeapPrune (offsetof(xl_heap_prune, flags) + sizeof(uint8)) +#define SizeOfHeapPrune (offsetof(xl_heap_prune, flags) + sizeof(uint16)) #define XLHP_SET_PD_ALL_VIS (1 << 0) -/* to handle recovery conflict during logical decoding on standby */ +/* + * To handle recovery conflict during logical decoding on standby, we must know + * if the table is a catalog table. Note that in visibilitymapdefs.h + * VISIBILITYMAP_XLOG_CATALOG_REL is also defined as (1 << 2). 
xl_heap_prune + * records should use XLHP_IS_CATALOG_REL, not VISIBILIYTMAP_XLOG_CATALOG_REL -- + * even if they only contain updates to the VM. + */ #define XLHP_IS_CATALOG_REL (1 << 1) /* @@ -332,6 +338,15 @@ typedef struct xl_heap_prune #define XLHP_HAS_DEAD_ITEMS (1 << 6) #define XLHP_HAS_NOW_UNUSED_ITEMS (1 << 7) +/* + * The xl_heap_prune record's flags may also contain which VM bits to set. + * xl_heap_prune should always use the XLHP_VM_ALL_VISIBLE and + * XLHP_VM_ALL_FROZEN flags and translate them to their visibilitymapdefs.h + * equivalents, VISIBILITYMAP_ALL_VISIBLE and VISIBILITYMAP_ALL_FROZEN. + */ +#define XLHP_VM_ALL_VISIBLE (1 << 8) +#define XLHP_VM_ALL_FROZEN (1 << 9) + /* * xlhp_freeze_plan describes how to freeze a group of one or more heap tuples * (appears in xl_heap_prune's xlhp_freeze_plans sub-record) @@ -498,7 +513,7 @@ extern XLogRecPtr log_heap_visible(Relation rel, uint8 vmflags); /* in heapdesc.c, so it can be shared between frontend/backend code */ -extern void heap_xlog_deserialize_prune_and_freeze(char *cursor, uint8 flags, +extern void heap_xlog_deserialize_prune_and_freeze(char *cursor, uint16 flags, int *nplans, xlhp_freeze_plan **plans, OffsetNumber **frz_offsets, int *nredirected, OffsetNumber **redirected, -- 2.43.0