From 9904f827846bb2660dbc9ff0ecb1d24dbe9dc3bc Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Wed, 17 Sep 2025 17:29:59 -0400 Subject: [PATCH v14 17/24] Eliminate XLOG_HEAP2_VISIBLE from vacuum phase I prune/freeze Instead of emitting a separate WAL record for every block rendered all-visible/frozen by vacuum's phase I, include the changes to the VM in the XLOG_HEAP2_PRUNE_VACUUM_SCAN record already emitted. This is only enabled for vacuum's prune/freeze work, not for on-access pruning. --- src/backend/access/heap/pruneheap.c | 183 +++++++++++++++++----------- src/include/access/heapam.h | 3 +- 2 files changed, 112 insertions(+), 74 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index e3f9967e26c..a14c793da7e 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -662,50 +662,58 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, prstate.deadoffsets = presult->deadoffsets; /* - * Caller may update the VM after we're done. We can keep track of - * whether the page will be all-visible and all-frozen after pruning and - * freezing to help the caller to do that. + * Keep track of whether or not the page will be all-visible and + * all-frozen for use in opportunistic freezing and to update the VM if + * the caller requests it. * - * Currently, only VACUUM sets the VM bits. To save the effort, only do - * the bookkeeping if the caller needs it. Currently, that's tied to - * HEAP_PAGE_PRUNE_FREEZE, but it could be a separate flag if you wanted - * to update the VM bits without also freezing or freeze without also - * setting the VM bits. + * Currently, only VACUUM attempts freezing. But other callers could. 
The + * visibility bookkeeping is required for opportunistic freezing (in + * addition to setting the VM bits) because we only consider + * opportunistically freezing tuples if the whole page would become + * all-frozen or if the whole page will be frozen except for dead tuples + * that will be removed by vacuum. But if consider_update_vm is false, + * we'll not set the VM even if the page is discovered to be all-visible. - * - * In addition to telling the caller whether it can set the VM bit, we - * also use 'all_visible' and 'all_frozen' for our own decision-making. If - * the whole page would become frozen, we consider opportunistically - * freezing tuples. We will not be able to freeze the whole page if there - * are tuples present that are not visible to everyone or if there are - * dead tuples which are not yet removable. However, dead tuples which - * will be removed by the end of vacuuming should not preclude us from - * opportunistically freezing. Because of that, we do not immediately - * clear all_visible when we see LP_DEAD items. We fix that after - * scanning the line pointers, before we return the value to the caller, - * so that the caller doesn't set the VM bit incorrectly. + * If only HEAP_PAGE_PRUNE_UPDATE_VIS is passed and not + * HEAP_PAGE_PRUNE_FREEZE, prstate.all_frozen must be initialized to false + * because we will not call heap_prepare_freeze_tuple() on each tuple. + * + * Dead tuples which will be removed by the end of vacuuming should not + * preclude us from opportunistically freezing, so we do not clear + * all_visible when we see LP_DEAD items. We fix that after determining + * whether or not to freeze but before deciding whether or not to update + * the VM so that we don't set the VM bit incorrectly. + * + * If not freezing and not updating the VM, we avoid the extra + * bookkeeping. Initializing all_visible to false allows skipping the work + * to update them in heap_prune_record_unchanged_lp_normal(). 
*/ if (prstate.attempt_freeze) { prstate.all_visible = true; prstate.all_frozen = true; } + else if ((options & HEAP_PAGE_PRUNE_UPDATE_VIS) != 0) + { + prstate.all_visible = true; + prstate.all_frozen = false; + } else { - /* - * Initializing to false allows skipping the work to update them in - * heap_prune_record_unchanged_lp_normal(). - */ prstate.all_visible = false; prstate.all_frozen = false; } /* - * The visibility cutoff xid is the newest xmin of live tuples on the - * page. In the common case, this will be set as the conflict horizon the - * caller can use for updating the VM. If, at the end of freezing and - * pruning, the page is all-frozen, there is no possibility that any - * running transaction on the standby does not see tuples on the page as - * all-visible, so the conflict horizon remains InvalidTransactionId. + * The visibility cutoff xid is the newest xmin of live, committed tuples + * older than OldestXmin on the page. This field is only kept up-to-date + * if the page is all-visible. As soon as a tuple is encountered that is + * not visible to all, this field is unmaintained. As long as it is + * maintained, it can be used to calculate the snapshot conflict horizon. + * This is most likely to happen when updating the VM and/or freezing all + * live tuples on the page. It is updated before returning to the caller + * because vacuum does assert-build only validation on the page using this + * field. */ prstate.visibility_cutoff_xid = InvalidTransactionId; @@ -943,16 +951,15 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * visibility map bits based on information from the VM and from * all_visible and all_frozen variables. * - * Though callers should set the VM if PD_ALL_VISIBLE is set, it is - * allowed for the page-level bit to be set and the VM to be clear. We log - * setting PD_ALL_VISIBLE on the heap page in a - * XLOG_HEAP2_PRUNE_VACUUM_SCAN record and setting the VM bits in a later - * emitted XLOG_HEAP2_VISIBLE record. 
+ * It is allowed for the page-level bit to be set and the VM to be clear, + * however, we have a strong preference for keeping them in sync. * - * Setting PD_ALL_VISIBLE when we are making the changes to the page that - * render it all-visible allows us to omit the heap page from the WAL - * chain when later updating the VM -- even when checksums/wal_log_hints - * are enabled. + * Prior to Postgres 19, it was possible for the page-level bit to be set + * and the VM bit to be clear. This could happen if we crashed after + * setting PD_ALL_VISIBLE but before setting bits in the VM. + * + * As such, it is possible to only update the VM when PD_ALL_VISIBLE is + * already set. */ do_set_pd_vis = false; do_set_vm = false; @@ -961,6 +968,10 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, blockno, buffer, vmbuffer, blk_known_av, &prstate, &new_vmbits, &do_set_pd_vis); + /* Lock vmbuffer before entering a critical section */ + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_EXCLUSIVE); + /* Any error while applying the changes is critical */ START_CRIT_SECTION(); @@ -991,7 +1002,7 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, MarkBufferDirtyHint(buffer, true); } - if (do_prune || do_freeze || do_set_pd_vis) + if (do_prune || do_freeze || do_set_pd_vis || do_set_vm) { /* Apply the planned item changes and repair page fragmentation. 
*/ if (do_prune) @@ -1008,12 +1019,31 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, if (do_set_pd_vis) PageSetAllVisible(page); - MarkBufferDirty(buffer); + if (do_prune || do_freeze || do_set_pd_vis) + MarkBufferDirty(buffer); + + if (do_set_vm) + { + Assert(PageIsAllVisible(page)); + + old_vmbits = visibilitymap_set_vmbits(relation, blockno, + vmbuffer, new_vmbits); + if (old_vmbits == new_vmbits) + { + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + /* Unset so we don't emit WAL since no change occurred */ + do_set_vm = false; + } + } /* - * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did + * Emit a WAL XLOG_HEAP2_PRUNE* record showing what we did. If we were + * only updating the VM and it turns out it was already set, we will + * have unset do_set_vm earlier. As such, check it again before + * emitting the record. */ - if (RelationNeedsWAL(relation)) + if (RelationNeedsWAL(relation) && + (do_prune || do_freeze || do_set_pd_vis || do_set_vm)) { /* * The snapshotConflictHorizon for the whole record should be the @@ -1025,15 +1055,45 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, * on the standby with xids older than the youngest tuple this * record will freeze will conflict. */ - TransactionId conflict_xid; + TransactionId conflict_xid = InvalidTransactionId; - if (TransactionIdFollows(frz_conflict_horizon, prstate.latest_xid_removed)) + /* + * If we are updating the VM, the conflict horizon is almost + * always the visibility cutoff XID. + * + * Separately, if we are freezing any tuples, as an optimization, + * we can use the visibility_cutoff_xid as the conflict horizon if + * the page will be all-frozen. This is true even if there are + * LP_DEAD line pointers because we ignored those when maintaining + * the visibility_cutoff_xid. This will have been calculated + * earlier as the frz_conflict_horizon when we determined we would + * freeze. 
+ */ + if (do_set_vm) + conflict_xid = prstate.visibility_cutoff_xid; + else if (do_freeze) conflict_xid = frz_conflict_horizon; - else + + /* + * If we are removing tuples with a younger xmax than our so far + * calculated conflict_xid, we must use this as our horizon. + */ + if (TransactionIdFollows(prstate.latest_xid_removed, conflict_xid)) conflict_xid = prstate.latest_xid_removed; + /* + * We can omit the snapshot conflict horizon if we are not pruning + * or freezing any tuples and are setting an already all-visible + * page all-frozen in the VM. In this case, all of the tuples on + * the page must already be visible to all MVCC snapshots on the + * standby. + */ + if (!do_prune && !do_freeze && do_set_vm && + blk_known_av && (new_vmbits & VISIBILITYMAP_ALL_FROZEN)) + conflict_xid = InvalidTransactionId; + log_heap_prune_and_freeze(relation, buffer, - InvalidBuffer, 0, + vmbuffer, new_vmbits, conflict_xid, true, do_set_pd_vis, @@ -1047,6 +1107,9 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, END_CRIT_SECTION(); + if (do_set_vm) + LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK); + Assert(!prstate.all_visible || (prstate.lpdead_items == 0)); /* @@ -1078,32 +1141,6 @@ heap_page_prune_and_freeze(Relation relation, Buffer buffer, } #endif - /* Now set the VM */ - if (do_set_vm) - { - TransactionId vm_conflict_horizon; - - Assert((new_vmbits & VISIBILITYMAP_VALID_BITS) != 0); - - /* - * The conflict horizon for that record must be the newest xmin on the - * page. However, if the page is completely frozen, there can be no - * conflict and the vm_conflict_horizon should remain - * InvalidTransactionId. This includes the case that we just froze - * all the tuples; the prune-freeze record included the conflict XID - * already so a snapshotConflictHorizon sufficient to make everything - * safe for REDO was logged when the page's tuples were frozen. 
- */ - if (prstate.all_frozen) - vm_conflict_horizon = InvalidTransactionId; - else - vm_conflict_horizon = prstate.visibility_cutoff_xid; - old_vmbits = visibilitymap_set(relation, blockno, - InvalidXLogRecPtr, - vmbuffer, vm_conflict_horizon, - new_vmbits); - } - /* Copy information back for caller */ presult->ndeleted = prstate.ndeleted; presult->nnewlpdead = prstate.ndead; @@ -2261,7 +2298,7 @@ heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples, * - Reaping: During vacuum phase III, items that are already LP_DEAD are * marked as unused. * - * - VM updates: After vacuum phase III, the heap page may be marked + * - VM updates: After vacuum phases I and III, the heap page may be marked * all-visible and all-frozen. * * These changes all happen together, so we use a single WAL record for them diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 797cd51145d..cac7a4c2899 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -239,7 +239,8 @@ typedef struct PruneFreezeResult * visibility map before updating it during phase I of vacuuming. * new_vmbits are the state of those bits after phase I of vacuuming. * - * These are only set if the HEAP_PAGE_PRUNE_UPDATE_VIS option is set. + * These are only set if the HEAP_PAGE_PRUNE_UPDATE_VIS option is set and + * we have actually updated the VM. */ uint8 new_vmbits; uint8 old_vmbits; -- 2.43.0