From 8ff462278736c7fa1de096f43e805a92c68a5b07 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Wed, 3 Dec 2025 15:24:08 -0500 Subject: [PATCH v31 15/16] Allow on-access pruning to set pages all-visible Many queries do not modify the underlying relation. For such queries, if on-access pruning occurs during the scan, we can check whether the page has become all-visible and update the visibility map accordingly. Previously, only vacuum and COPY FREEZE marked pages as all-visible or all-frozen. This commit implements on-access VM setting for sequential scans as well as for the underlying heap relation in index scans and bitmap heap scans. Author: Melanie Plageman Reviewed-by: Andres Freund Reviewed-by: Kirill Reshke Discussion: https://postgr.es/m/flat/CAAKRu_ZMw6Npd_qm2KM%2BFwQ3cMOMx1Dh3VMhp8-V7SOLxdK9-g%40mail.gmail.com --- src/backend/access/heap/heapam.c | 15 ++++++- src/backend/access/heap/heapam_handler.c | 15 ++++++- src/backend/access/heap/pruneheap.c | 40 ++++++++++++++++++- src/include/access/heapam.h | 24 +++++++++-- .../t/035_standby_logical_decoding.pl | 3 +- 5 files changed, 89 insertions(+), 8 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index fb7a7548aa0..d9dc79f4a96 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -570,6 +570,7 @@ heap_prepare_pagescan(TableScanDesc sscan) Buffer buffer = scan->rs_cbuf; BlockNumber block = scan->rs_cblock; Snapshot snapshot; + Buffer *vmbuffer = NULL; Page page; int lines; bool all_visible; @@ -584,7 +585,9 @@ heap_prepare_pagescan(TableScanDesc sscan) /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_base.rs_rd, buffer); + if (sscan->rs_flags & SO_HINT_REL_READ_ONLY) + vmbuffer = &scan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_base.rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple @@ -1261,6 +1264,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, sizeof(TBMIterateResult)); } + scan->rs_vmbuffer = InvalidBuffer; return (TableScanDesc) scan; } @@ -1299,6 +1303,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, scan->rs_cbuf = InvalidBuffer; } + if (BufferIsValid(scan->rs_vmbuffer)) + { + ReleaseBuffer(scan->rs_vmbuffer); + scan->rs_vmbuffer = InvalidBuffer; + } + /* * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any * additional data vs a normal HeapScan @@ -1331,6 +1341,9 @@ heap_endscan(TableScanDesc sscan) if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); + if (BufferIsValid(scan->rs_vmbuffer)) + ReleaseBuffer(scan->rs_vmbuffer); + /* * Must free the read stream before freeing the BufferAccessStrategy. 
*/ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index fc251e11f8a..6946da8c9d7 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -85,6 +85,7 @@ heapam_index_fetch_begin(Relation rel, uint32 flags) hscan->xs_base.rel = rel; hscan->xs_cbuf = InvalidBuffer; + hscan->xs_vmbuffer = InvalidBuffer; hscan->modifies_base_rel = !(flags & SO_HINT_REL_READ_ONLY); return &hscan->xs_base; @@ -100,6 +101,12 @@ heapam_index_fetch_reset(IndexFetchTableData *scan) ReleaseBuffer(hscan->xs_cbuf); hscan->xs_cbuf = InvalidBuffer; } + + if (BufferIsValid(hscan->xs_vmbuffer)) + { + ReleaseBuffer(hscan->xs_vmbuffer); + hscan->xs_vmbuffer = InvalidBuffer; + } } static void @@ -139,7 +146,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * Prune page, but only if we weren't already on this page */ if (prev_buf != hscan->xs_cbuf) - heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); + heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf, + hscan->modifies_base_rel ? NULL : &hscan->xs_vmbuffer); } /* Obtain share-lock on the buffer so we can examine visibility */ @@ -2477,6 +2485,7 @@ BitmapHeapScanNextBlock(TableScanDesc scan, TBMIterateResult *tbmres; OffsetNumber offsets[TBM_MAX_TUPLES_PER_PAGE]; int noffsets = -1; + Buffer *vmbuffer = NULL; Assert(scan->rs_flags & SO_TYPE_BITMAPSCAN); Assert(hscan->rs_read_stream); @@ -2523,7 +2532,9 @@ BitmapHeapScanNextBlock(TableScanDesc scan, /* * Prune and repair fragmentation for the whole page, if possible. 
*/ - heap_page_prune_opt(scan->rs_rd, buffer); + if (scan->rs_flags & SO_HINT_REL_READ_ONLY) + vmbuffer = &hscan->rs_vmbuffer; + heap_page_prune_opt(scan->rs_rd, buffer, vmbuffer); /* * We must hold share lock on the buffer content while examining tuple diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 8e59e7692c1..f414f02964d 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -202,6 +202,8 @@ static bool heap_page_will_set_vm(PruneState *prstate, Relation relation, BlockNumber heap_blk, Buffer heap_buffer, Page heap_page, Buffer vmbuffer, + PruneReason reason, + bool do_prune, bool do_freeze, int nlpdead_items, uint8 *old_vmbits, uint8 *new_vmbits); @@ -223,9 +225,13 @@ static TransactionId get_conflict_xid(bool do_prune, bool do_freeze, bool do_set * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. + * + * If vmbuffer is not NULL, it is okay for pruning to set the visibility map if + * the page is all-visible. We will take care of pinning and, if needed, + * reading in the page of the visibility map. */ void -heap_page_prune_opt(Relation relation, Buffer buffer) +heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer) { Page page = BufferGetPage(buffer); TransactionId prune_xid; @@ -306,6 +312,13 @@ heap_page_prune_opt(Relation relation, Buffer buffer) .cutoffs = NULL, }; + if (vmbuffer) + { + visibilitymap_pin(relation, BufferGetBlockNumber(buffer), vmbuffer); + params.options |= HEAP_PAGE_PRUNE_UPDATE_VM; + params.vmbuffer = *vmbuffer; + } + heap_page_prune_and_freeze(&params, &presult, &dummy_off_loc, NULL, NULL); @@ -951,6 +964,9 @@ identify_and_fix_vm_corruption(Relation rel, Buffer heap_buffer, * corrupted, it will fix them by clearing the VM bits and visibility hint. * This does not need to be done in a critical section.
* + * This should be called only after do_freeze has been decided (and do_prune + * has been set), as these factor into our heuristic-based decision. + * * Returns true if one or both VM bits should be set, along with returning the * current value of the VM bits in *old_vmbits and the desired new value of * the VM bits in *new_vmbits. @@ -964,6 +980,8 @@ heap_page_will_set_vm(PruneState *prstate, Relation relation, BlockNumber heap_blk, Buffer heap_buffer, Page heap_page, Buffer vmbuffer, + PruneReason reason, + bool do_prune, bool do_freeze, int nlpdead_items, uint8 *old_vmbits, uint8 *new_vmbits) @@ -974,6 +992,24 @@ heap_page_will_set_vm(PruneState *prstate, if (!prstate->attempt_update_vm) return false; + /* + * If this is an on-access call and we're not actually pruning, avoid + * setting the visibility map if it would newly dirty the heap page or, if + * the page is already dirty, if doing so would require including a + * full-page image (FPI) of the heap page in the WAL. This situation + * should be rare, as on-access pruning is only attempted when + * pd_prune_xid is valid. + */ + if (reason == PRUNE_ON_ACCESS && + prstate->all_visible && + !do_prune && !do_freeze && + (!BufferIsDirty(heap_buffer) || XLogCheckBufferNeedsBackup(heap_buffer))) + { + prstate->all_visible = false; + prstate->all_frozen = false; + return false; + } + *old_vmbits = visibilitymap_get_status(relation, heap_blk, &vmbuffer); @@ -1171,6 +1207,8 @@ heap_page_prune_and_freeze(PruneFreezeParams *params, buffer, page, vmbuffer, + params->reason, + do_prune, do_freeze, prstate.lpdead_items, &old_vmbits, &new_vmbits); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index ba62a4d4cba..b0e7c71463c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -95,6 +95,13 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* + * For sequential scans and bitmap heap scans. 
If the relation is not + * being modified, on-access pruning may read in the current heap page's + * corresponding VM block to this buffer. + */ + Buffer rs_vmbuffer; + /* these fields only used in page-at-a-time mode and for bitmap scans */ uint32 rs_cindex; /* current tuple's index in vistuples */ uint32 rs_ntuples; /* number of visible tuples on page */ @@ -117,8 +124,18 @@ typedef struct IndexFetchHeapData { IndexFetchTableData xs_base; /* AM independent part of the descriptor */ - Buffer xs_cbuf; /* current heap buffer in scan, if any */ - /* NB: if xs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ + /* + * Current heap buffer in scan, if any. NB: if xs_cbuf is not + * InvalidBuffer, we hold a pin on that buffer. + */ + Buffer xs_cbuf; + + /* + * For index scans that do not modify the underlying heap table, on-access + * pruning may read in the current heap page's corresponding VM block to + * this buffer. + */ + Buffer xs_vmbuffer; /* * Some optimizations can only be performed if the query does not modify @@ -419,7 +436,8 @@ extern TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate); /* in heap/pruneheap.c */ -extern void heap_page_prune_opt(Relation relation, Buffer buffer); +extern void heap_page_prune_opt(Relation relation, Buffer buffer, + Buffer *vmbuffer); extern void heap_page_prune_and_freeze(PruneFreezeParams *params, PruneFreezeResult *presult, OffsetNumber *off_loc, diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl index ebe2fae1789..bdd9f0a62cd 100644 --- a/src/test/recovery/t/035_standby_logical_decoding.pl +++ b/src/test/recovery/t/035_standby_logical_decoding.pl @@ -296,6 +296,7 @@ wal_level = 'logical' max_replication_slots = 4 max_wal_senders = 4 autovacuum = off +hot_standby_feedback = on }); $node_primary->dump_info; $node_primary->start; @@ -748,7 +749,7 @@ check_pg_recvlogical_stderr($handle, $logstart = -s $node_standby->logfile; 
reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_', - 'no_conflict_', 0, 1); + 'no_conflict_', 1, 0); # This should not trigger a conflict wait_until_vacuum_can_remove( -- 2.43.0