From a407673cb2632d4544cc56458dbf4a063da2067c Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 2 Dec 2025 16:16:22 -0500 Subject: [PATCH v23 05/14] Eliminate XLOG_HEAP2_VISIBLE from vacuum phase I prune/freeze Vacuum no longer emits a separate WAL record for each page set all-visible or all-frozen during phase I. Instead, visibility map updates are now included in the XLOG_HEAP2_PRUNE_VACUUM_SCAN record that is already emitted for pruning and freezing. Previously, heap_page_prune_and_freeze() determined whether a page was all-visible, but the corresponding VM bits were only set later in lazy_scan_prune(). Now the VM is updated immediately in heap_page_prune_and_freeze(), at the same time as the heap modifications. This change applies only to vacuum phase I, not to pruning performed during normal page access. NOTE: This commit is the main commit and all review-only commits preceding it will be squashed into it. Author: Melanie Plageman Reviewed-by: Andres Freund Reviewed-by: Robert Haas Reviewed-by: Kirill Reshke Reviewed-by: Chao Li Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- src/backend/access/heap/heapam_xlog.c | 48 +++-- src/backend/access/heap/pruneheap.c | 294 +++++++++++++++----------- src/backend/access/heap/vacuumlazy.c | 1 + src/include/access/heapam.h | 1 + 4 files changed, 212 insertions(+), 132 deletions(-) diff --git a/src/backend/access/heap/heapam_xlog.c b/src/backend/access/heap/heapam_xlog.c index 11cb3f74da5..b1ceab71928 100644 --- a/src/backend/access/heap/heapam_xlog.c +++ b/src/backend/access/heap/heapam_xlog.c @@ -104,6 +104,8 @@ heap_xlog_prune_freeze(XLogReaderState *record) OffsetNumber *frz_offsets; char *dataptr = XLogRecGetBlockData(record, 0, &datalen); bool do_prune; + bool set_lsn = false; + bool mark_buffer_dirty = false; heap_xlog_deserialize_prune_and_freeze(dataptr, xlrec.flags, &nplans, &plans, &frz_offsets, @@ -157,17 +159,39 @@ 
heap_xlog_prune_freeze(XLogReaderState *record) /* There should be no more data */ Assert((char *) frz_offsets == dataptr + datalen); - if (vmflags & VISIBILITYMAP_VALID_BITS) - PageSetAllVisible(page); - - MarkBufferDirty(buffer); + if (do_prune || nplans > 0) + mark_buffer_dirty = set_lsn = true; /* - * See log_heap_prune_and_freeze() for commentary on when we set the - * heap page LSN. + * The critical integrity requirement here is that we must never end + * up with the visibility map bit set and the page-level + * PD_ALL_VISIBLE bit unset. If that were to occur, a subsequent page + * modification would fail to clear the visibility map bit. + * + * vmflags may be nonzero with PD_ALL_VISIBLE already set (e.g. when + * marking an all-visible page all-frozen). If only the VM is updated, + * the heap page need not be dirtied. */ - if (do_prune || nplans > 0 || - ((vmflags & VISIBILITYMAP_VALID_BITS) && XLogHintBitIsNeeded())) + if ((vmflags & VISIBILITYMAP_VALID_BITS) && !PageIsAllVisible(page)) + { + PageSetAllVisible(page); + mark_buffer_dirty = true; + + /* + * See log_heap_prune_and_freeze() for commentary on when we set + * the heap page LSN. + */ + if (XLogHintBitIsNeeded()) + set_lsn = true; + } + + /* We should always mark a buffer dirty before stamping with an LSN */ + Assert(!set_lsn || mark_buffer_dirty); + + if (mark_buffer_dirty) + MarkBufferDirty(buffer); + + if (set_lsn) PageSetLSN(page, lsn); /* @@ -246,10 +270,10 @@ heap_xlog_prune_freeze(XLogReaderState *record) /* * Replay XLOG_HEAP2_VISIBLE records. * - * The critical integrity requirement here is that we must never end up with - * a situation where the visibility map bit is set, and the page-level - * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent - * page modification would fail to clear the visibility map bit. 
+ * The critical integrity requirement here is that we must never end up with a + * situation where the visibility map bit is set, and the page-level + * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent page + * modification would fail to clear the visibility map bit. */ static void heap_xlog_visible(XLogReaderState *record) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 2512b5d83e3..b851d723c74 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -194,6 +194,12 @@ static void page_verify_redirects(Page page); static bool heap_page_will_freeze(Relation relation, Buffer buffer, bool did_tuple_hint_fpi, bool do_prune, bool do_hint_prune, PruneState *prstate); +static TransactionId get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, + uint8 new_vmbits, + TransactionId latest_xid_removed, + TransactionId frz_conflict_horizon, + TransactionId visibility_cutoff_xid, + bool blk_already_av); static bool heap_page_will_set_vis(Relation relation, BlockNumber heap_blk, Buffer heap_buf, @@ -783,6 +789,64 @@ heap_page_will_freeze(Relation relation, Buffer buffer, return do_freeze; } +/* + * Calculate the conflict horizon for the whole XLOG_HEAP2_PRUNE_VACUUM_SCAN + * or XLOG_HEAP2_PRUNE_ON_ACCESS record. + */ +static TransactionId +get_conflict_xid(bool do_prune, bool do_freeze, bool do_set_vm, uint8 new_vmbits, + TransactionId latest_xid_removed, TransactionId frz_conflict_horizon, + TransactionId visibility_cutoff_xid, bool blk_already_av) +{ + TransactionId conflict_xid; + + /* + * We can omit the snapshot conflict horizon if we are not pruning or + * freezing any tuples and are setting an already all-visible page + * all-frozen in the VM. In this case, all of the tuples on the page must + * already be visible to all MVCC snapshots on the standby. 
+ */ + if (!do_prune && !do_freeze && + do_set_vm && blk_already_av && (new_vmbits & VISIBILITYMAP_ALL_FROZEN)) + return InvalidTransactionId; + + /* + * The snapshotConflictHorizon for the whole record should be the most + * conservative of all the horizons calculated for any of the possible + * modifications. If this record will prune tuples, any transactions on + * the standby older than the youngest xmax of the most recently removed + * tuple this record will prune will conflict. If this record will freeze + * tuples, any transactions on the standby with xids older than the + * youngest tuple this record will freeze will conflict. + */ + conflict_xid = InvalidTransactionId; + + /* + * If we are updating the VM, the conflict horizon is almost always the + * visibility cutoff XID. + * + * Separately, if we are freezing any tuples, as an optimization, we can + * use the visibility_cutoff_xid as the conflict horizon if the page will + * be all-frozen. This is true even if there are LP_DEAD line pointers + * because we ignored those when maintaining the visibility_cutoff_xid. + * This will have been calculated earlier as the frz_conflict_horizon when + * we determined we would freeze. + */ + if (do_set_vm) + conflict_xid = visibility_cutoff_xid; + else if (do_freeze) + conflict_xid = frz_conflict_horizon; + + /* + * If we are removing tuples with a younger xmax than our so far + * calculated conflict_xid, we must use this as our horizon. + */ + if (TransactionIdFollows(latest_xid_removed, conflict_xid)) + conflict_xid = latest_xid_removed; + + return conflict_xid; +} + /* * Decide whether to set the visibility map bits for heap_blk, using * information from PruneState and blk_known_av. 
Some callers may already have @@ -984,7 +1048,6 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, Buffer vmbuffer = params->vmbuffer; Page page = BufferGetPage(buffer); BlockNumber blockno = BufferGetBlockNumber(buffer); - TransactionId vm_conflict_horizon = InvalidTransactionId; PruneState prstate; bool do_freeze; bool do_prune; @@ -993,6 +1056,9 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, bool do_set_pd_vis; bool did_tuple_hint_fpi; int64 fpi_before = pgWalUsage.wal_fpi; + TransactionId conflict_xid = InvalidTransactionId; + uint8 new_vmbits = 0; + uint8 old_vmbits = 0; /* Initialize prstate */ prune_freeze_setup(params, @@ -1058,6 +1124,39 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, prstate.all_visible = prstate.all_frozen = false; Assert(!prstate.all_frozen || prstate.all_visible); + Assert(!prstate.all_visible || (prstate.lpdead_items == 0)); + + /* + * Decide whether to set the page-level PD_ALL_VISIBLE bit and the VM bits + * based on information from the VM and the all_visible/all_frozen flags. + * + * While it is valid for PD_ALL_VISIBLE to be set when the corresponding + * VM bit is clear, we strongly prefer to keep them in sync. + * + * Accordingly, we also allow updating only the VM when PD_ALL_VISIBLE has + * already been set. Setting only the VM is most common when setting an + * already all-visible page all-frozen. + */ + do_set_vm = heap_page_will_set_vis(params->relation, + blockno, buffer, vmbuffer, params->blk_known_av, + &prstate, &new_vmbits, &do_set_pd_vis); + + /* We should only set the VM if PD_ALL_VISIBLE is set or will be */ + Assert(!do_set_vm || do_set_pd_vis || PageIsAllVisible(page)); + + /* + * new_vmbits should be 0 regardless of whether or not the page is + * all-visible if we do not intend to set the VM. 
+ */ + Assert(do_set_vm || new_vmbits == 0); + + conflict_xid = get_conflict_xid(do_prune, do_freeze, do_set_vm, new_vmbits, + prstate.latest_xid_removed, prstate.frz_conflict_horizon, + prstate.visibility_cutoff_xid, params->blk_known_av); + + /* Lock vmbuffer before entering a critical section */ + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); /* Any error while applying the changes is critical */ START_CRIT_SECTION(); @@ -1079,14 +1178,17 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, /* * If that's all we had to do to the page, this is a non-WAL-logged - * hint. If we are going to freeze or prune the page, we will mark - * the buffer dirty below. + * hint. If we are going to freeze or prune the page or set + * PD_ALL_VISIBLE, we will mark the buffer dirty below. + * + * Setting PD_ALL_VISIBLE is fully WAL-logged because it is forbidden + * for the VM to be set and PD_ALL_VISIBLE to be clear. */ - if (!do_freeze && !do_prune) + if (!do_freeze && !do_prune && !do_set_pd_vis) MarkBufferDirtyHint(buffer, true); } - if (do_prune || do_freeze) + if (do_prune || do_freeze || do_set_vm) { /* Apply the planned item changes and repair page fragmentation. */ if (do_prune) @@ -1100,36 +1202,33 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, if (do_freeze) heap_freeze_prepared_tuples(buffer, prstate.frozen, prstate.nfrozen); - MarkBufferDirty(buffer); + if (do_set_pd_vis) + PageSetAllVisible(page); + + if (do_prune || do_freeze || do_set_pd_vis) + MarkBufferDirty(buffer); + + if (do_set_vm) + { + Assert(PageIsAllVisible(page)); + old_vmbits = visibilitymap_set_vmbits(blockno, + vmbuffer, new_vmbits, + params->relation->rd_locator); + Assert(old_vmbits != new_vmbits); + } /* * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did */ if (RelationNeedsWAL(params->relation)) { - /* - * The snapshotConflictHorizon for the whole record should be the - * most conservative of all the horizons calculated for any of the - * possible modifications. 
If this record will prune tuples, any - * transactions on the standby older than the youngest xmax of the - * most recently removed tuple this record will prune will - * conflict. If this record will freeze tuples, any transactions - * on the standby with xids older than the youngest tuple this - * record will freeze will conflict. - */ - TransactionId conflict_xid; - - if (TransactionIdFollows(prstate.frz_conflict_horizon, - prstate.latest_xid_removed)) - conflict_xid = prstate.frz_conflict_horizon; - else - conflict_xid = prstate.latest_xid_removed; - log_heap_prune_and_freeze(params->relation, buffer, - InvalidBuffer, /* vmbuffer */ - 0, /* vmflags */ + do_set_vm ? vmbuffer : InvalidBuffer, + do_set_vm ? new_vmbits : 0, conflict_xid, - true, params->reason, + true, /* cleanup lock */ + do_set_pd_vis, + params->reason, prstate.frozen, prstate.nfrozen, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, @@ -1139,43 +1238,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, END_CRIT_SECTION(); - /* Copy information back for caller */ - presult->ndeleted = prstate.ndeleted; - presult->nnewlpdead = prstate.ndead; - presult->nfrozen = prstate.nfrozen; - presult->live_tuples = prstate.live_tuples; - presult->recently_dead_tuples = prstate.recently_dead_tuples; - presult->hastup = prstate.hastup; - - presult->lpdead_items = prstate.lpdead_items; - /* the presult->deadoffsets array was already filled in */ - - if (prstate.attempt_freeze) - { - if (presult->nfrozen > 0) - { - *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid; - *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid; - } - else - { - *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid; - *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid; - } - } - - /* - * If updating the visibility map, the conflict horizon for that record - * must be the newest xmin on the page. 
However, if the page is - * completely frozen, there can be no conflict and the vm_conflict_horizon - * should remain InvalidTransactionId. This includes the case that we - * just froze all the tuples; the prune-freeze record included the - * conflict XID already so we don't need to again. - */ - if (prstate.all_frozen) - vm_conflict_horizon = InvalidTransactionId; - else - vm_conflict_horizon = prstate.visibility_cutoff_xid; + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); /* * During its second pass over the heap, VACUUM calls @@ -1190,7 +1254,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, TransactionId debug_cutoff; bool debug_all_frozen; - Assert(presult->lpdead_items == 0); + Assert(prstate.lpdead_items == 0); + Assert(prstate.cutoffs); Assert(heap_page_is_all_visible(params->relation, buffer, prstate.cutoffs->OldestXmin, @@ -1200,62 +1265,36 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, Assert(prstate.all_frozen == debug_all_frozen); Assert(!TransactionIdIsValid(debug_cutoff) || - debug_cutoff == vm_conflict_horizon); + debug_cutoff == prstate.visibility_cutoff_xid); } #endif - Assert(!prstate.all_frozen || prstate.all_visible); - Assert(!prstate.all_visible || (prstate.lpdead_items == 0)); - - /* - * Decide whether to set the page-level PD_ALL_VISIBLE bit and the VM bits - * based on information from the VM and the all_visible/all_frozen flags. - * - * While it is valid for PD_ALL_VISIBLE to be set when the corresponding - * VM bit is clear, we strongly prefer to keep them in sync. - * - * Accordingly, we also allow updating only the VM when PD_ALL_VISIBLE has - * already been set. Setting only the VM is most common when setting an - * already all-visible page all-frozen. 
- */ - do_set_vm = heap_page_will_set_vis(params->relation, - blockno, - buffer, - vmbuffer, - params->blk_known_av, - &prstate, - &presult->new_vmbits, - &do_set_pd_vis); - - /* We should only set the VM if PD_ALL_VISIBLE is set or will be */ - Assert(!do_set_vm || do_set_pd_vis || PageIsAllVisible(page)); + /* Copy information back for caller */ + presult->ndeleted = prstate.ndeleted; + presult->nnewlpdead = prstate.ndead; + presult->nfrozen = prstate.nfrozen; + presult->live_tuples = prstate.live_tuples; + presult->recently_dead_tuples = prstate.recently_dead_tuples; + presult->hastup = prstate.hastup; + presult->new_vmbits = new_vmbits; + presult->old_vmbits = old_vmbits; - /* - * new_vmbits should be 0 regardless of whether or not the page is - * all-visible if we do not intend to set the VM. - */ - Assert(do_set_vm || presult->new_vmbits == 0); + presult->lpdead_items = prstate.lpdead_items; + /* the presult->deadoffsets array was already filled in */ - if (do_set_pd_vis) + if (prstate.attempt_freeze) { - /* - * NB: If the heap page is all-visible but the VM bit is not set, we - * don't need to dirty the heap page. However, if checksums are - * enabled, we do need to make sure that the heap page is dirtied - * before passing it to visibilitymap_set(), because it may be logged. - * Given that this situation should only happen in rare cases after a - * crash, it is not worth optimizing. 
- */ - MarkBufferDirty(buffer); - PageSetAllVisible(page); + if (presult->nfrozen > 0) + { + *new_relfrozen_xid = prstate.pagefrz.FreezePageRelfrozenXid; + *new_relmin_mxid = prstate.pagefrz.FreezePageRelminMxid; + } + else + { + *new_relfrozen_xid = prstate.pagefrz.NoFreezePageRelfrozenXid; + *new_relmin_mxid = prstate.pagefrz.NoFreezePageRelminMxid; + } } - - presult->old_vmbits = 0; - if (do_set_vm) - presult->old_vmbits = visibilitymap_set(params->relation, blockno, buffer, - InvalidXLogRecPtr, - vmbuffer, vm_conflict_horizon, - presult->new_vmbits); } @@ -2387,14 +2426,18 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * * This is used for several different page maintenance operations: * - * - Page pruning, in VACUUM's 1st pass or on access: Some items are + * - Page pruning, in vacuum phase I or on-access: Some items are * redirected, some marked dead, and some removed altogether. * - * - Freezing: Items are marked as 'frozen'. + * - Freezing: During vacuum phase I, items are marked as 'frozen'. + * + * - Reaping: During vacuum phase III, items that are already LP_DEAD are + * marked as unused. * - * - Vacuum, 2nd pass: Items that are already LP_DEAD are marked as unused. + * - VM updates: After vacuum phases I and III, the heap page may be marked + * all-visible and all-frozen. * - * They have enough commonalities that we use a single WAL record for them + * These changes all happen together, so we use a single WAL record for them * all. * * If replaying the record requires a cleanup lock, pass cleanup_lock = true. @@ -2406,6 +2449,15 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * case, vmbuffer should already have been updated and marked dirty and should * still be pinned and locked. * + * set_pd_all_vis indicates that we set PD_ALL_VISIBLE and thus should update + * the page LSN when checksums/wal_log_hints are enabled even if we did not + * prune or freeze tuples on the page. 
+ * + * In some cases, such as when heap_page_prune_and_freeze() is setting an + * already marked all-visible page all-frozen, PD_ALL_VISIBLE may already be + * set. So, it is possible for vmflags to be non-zero and set_pd_all_vis to be + * false. + * * Note: This function scribbles on the 'frozen' array. * * Note: This is called in a critical section, so careful what you do here. @@ -2415,6 +2467,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, + bool set_pd_all_vis, PruneReason reason, HeapTupleFreeze *frozen, int nfrozen, OffsetNumber *redirected, int nredirected, @@ -2451,7 +2504,7 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, */ if (!do_prune && nfrozen == 0 && - (!do_set_vm || !XLogHintBitIsNeeded())) + (!set_pd_all_vis || !XLogHintBitIsNeeded())) regbuf_flags_heap |= REGBUF_NO_IMAGE; /* @@ -2569,7 +2622,8 @@ log_heap_prune_and_freeze(Relation relation, Buffer buffer, * See comment at the top of the function about regbuf_flags_heap for * details on when we can advance the page LSN. 
*/ - if (do_prune || nfrozen > 0 || (do_set_vm && XLogHintBitIsNeeded())) + if (do_prune || nfrozen > 0 || + (set_pd_all_vis && XLogHintBitIsNeeded())) { Assert(BufferIsDirty(buffer)); PageSetLSN(BufferGetPage(buffer), recptr); diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4aa425ec945..0d39d57115d 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -2776,6 +2776,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, vmflags, conflict_xid, false, /* no cleanup lock required */ + (vmflags & VISIBILITYMAP_VALID_BITS) != 0, PRUNE_VACUUM_CLEANUP, NULL, 0, /* frozen */ NULL, 0, /* redirected */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index b20096b6ca1..14c1d92604d 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -435,6 +435,7 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, Buffer vmbuffer, uint8 vmflags, TransactionId conflict_xid, bool cleanup_lock, + bool set_pd_all_vis, PruneReason reason, HeapTupleFreeze *frozen, int nfrozen, OffsetNumber *redirected, int nredirected, -- 2.43.0