Separate catalog_xmin from xmin in walsender hot standby feedback When a standby sends hot standby feedback to a primary that does not use a physical replication slot, ProcessStandbyHSFeedbackMessage() stores min(feedbackCatalogXmin, feedbackXmin) into MyProc->xmin. Because ComputeXidHorizons() treats proc->xmin uniformly for both data and catalog horizons, a very old catalog_xmin from a logical replication slot on the standby incorrectly prevents vacuuming of regular user data tables on the primary. Physical replication slots already handle this correctly by tracking xmin and catalog_xmin in separate slot fields, so slot_catalog_xmin only affects catalog_oldest_nonremovable and not data_oldest_nonremovable. However, many production HA deployments intentionally avoid physical replication slots due to their lifecycle management complexity across failovers -- when a primary fails, physical slots on the old primary are lost and cannot be automatically migrated to the promoted standby. These deployments rely on wal_keep_size or WAL archiving for WAL retention, combined with hot_standby_feedback for visibility horizon management. The no-slot path is therefore a legitimate and widely-used production configuration, not a legacy code path. This commit adds a catalog_xmin field to PGPROC, so that the walsender can track the catalog horizon separately from the data horizon even without a replication slot. ComputeXidHorizons() now accumulates proc->catalog_xmin and applies it only to catalog_oldest_nonremovable and shared_oldest_nonremovable (after saving the _raw value), exactly mirroring how slot_catalog_xmin is handled. GetReplicationHorizons() is updated to include proc_catalog_xmin in the catalog_xmin feedback sent upstream, ensuring correct propagation in cascading standby configurations. 
--- src/include/storage/proc.h | 4 + src/backend/storage/lmgr/proc.c | 2 + src/backend/storage/ipc/procarray.c | 50 ++++++++-- src/backend/replication/walsender.c | 19 ++-- src/test/recovery/t/053_hs_feedback_catalog_xmin.pl | 181 +++++++++++++++++++ 5 files changed, 240 insertions(+), 16 deletions(-) create mode 100644 src/test/recovery/t/053_hs_feedback_catalog_xmin.pl diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 3e1d1fa..d49bd0d 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -250,6 +250,10 @@ typedef struct PGPROC * vacuum must not remove tuples deleted by * xid >= xmin ! */ + TransactionId catalog_xmin; /* catalog xmin from hot standby feedback, + * used separately from xmin so that it only + * affects catalog horizon, not data horizon */ + XidCacheStatus subxidStatus; /* mirrored with * ProcGlobal->subxidStates[i] */ struct XidCache subxids; /* cache for subtransaction XIDs */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 1ac2506..258f28c 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -476,6 +476,7 @@ InitProcess(void) MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; + MyProc->catalog_xmin = InvalidTransactionId; MyProc->pid = MyProcPid; MyProc->vxid.procNumber = MyProcNumber; MyProc->vxid.lxid = InvalidLocalTransactionId; @@ -678,6 +679,7 @@ InitAuxiliaryProcess(void) MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; + MyProc->catalog_xmin = InvalidTransactionId; MyProc->vxid.procNumber = INVALID_PROC_NUMBER; MyProc->vxid.lxid = InvalidLocalTransactionId; MyProc->databaseId = InvalidOid; diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 9299bce..91d2747 100644 --- a/src/backend/storage/ipc/procarray.c +++ 
b/src/backend/storage/ipc/procarray.c @@ -208,6 +208,12 @@ typedef struct ComputeXidHorizonsResult TransactionId slot_xmin; TransactionId slot_catalog_xmin; + /* + * Oldest catalog_xmin accumulated from PGPROC entries (set by walsenders + * without a replication slot via hot standby feedback). + */ + TransactionId proc_catalog_xmin; + /* * Oldest xid that any backend might still consider running. This needs to * include processes running VACUUM, in contrast to the normal visibility @@ -698,6 +704,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; + proc->catalog_xmin = InvalidTransactionId; /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; @@ -738,6 +745,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) proc->xid = InvalidTransactionId; proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; + proc->catalog_xmin = InvalidTransactionId; /* be sure this is cleared in abort */ proc->delayChkptFlags = 0; @@ -923,6 +931,7 @@ ProcArrayClearTransaction(PGPROC *proc) proc->vxid.lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; + proc->catalog_xmin = InvalidTransactionId; Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); Assert(!proc->delayChkptFlags); @@ -1680,6 +1689,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) /* inferred after ProcArrayLock is released */ h->catalog_oldest_nonremovable = InvalidTransactionId; + h->proc_catalog_xmin = InvalidTransactionId; LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -1801,6 +1811,20 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) h->data_oldest_nonremovable = TransactionIdOlder(h->data_oldest_nonremovable, xmin); } + + /* + * Accumulate catalog_xmin from PGPROC entries. 
This is set by + * walsenders without a replication slot to separately track the + * catalog horizon from hot standby feedback, so that it only holds + * back catalog table vacuuming, not user data table vacuuming. + */ + { + TransactionId catxmin = UINT32_ACCESS_ONCE(proc->catalog_xmin); + + if (TransactionIdIsValid(catxmin)) + h->proc_catalog_xmin = + TransactionIdOlder(h->proc_catalog_xmin, catxmin); + } } /* @@ -1842,19 +1866,25 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) /* * The only difference between catalog / data horizons is that the slot's - * catalog xmin is applied to the catalog one (so catalogs can be accessed - * for logical decoding). Initialize with data horizon, and then back up - * further if necessary. Have to back up the shared horizon as well, since - * that also can contain catalogs. + * catalog xmin and proc catalog_xmin are applied to the catalog one (so + * catalogs can be accessed for logical decoding). Initialize with data + * horizon, and then back up further if necessary. Have to back up the + * shared horizon as well, since that also can contain catalogs. 
*/ h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; h->shared_oldest_nonremovable = TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_catalog_xmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->proc_catalog_xmin); h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; h->catalog_oldest_nonremovable = TransactionIdOlder(h->catalog_oldest_nonremovable, h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->proc_catalog_xmin); /* * It's possible that slots backed up the horizons further than @@ -1897,6 +1927,9 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || TransactionIdPrecedesOrEquals(h->oldest_considered_running, h->slot_catalog_xmin)); + Assert(!TransactionIdIsValid(h->proc_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->proc_catalog_xmin)); /* update approximate horizons with the computed horizons */ GlobalVisUpdateApply(h); @@ -1991,12 +2024,13 @@ GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) /* * Don't want to use shared_oldest_nonremovable here, as that contains the - * effect of replication slot's catalog_xmin. We want to send a separate - * feedback for the catalog horizon, so the primary can remove data table - * contents more aggressively. + * effect of replication slot's catalog_xmin and proc catalog_xmin. We want + * to send a separate feedback for the catalog horizon, so the primary can + * remove data table contents more aggressively. 
*/ *xmin = horizons.shared_oldest_nonremovable_raw; - *catalog_xmin = horizons.slot_catalog_xmin; + *catalog_xmin = TransactionIdOlder(horizons.slot_catalog_xmin, + horizons.proc_catalog_xmin); } /* diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3d4ab92..4a5b74c 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2610,6 +2610,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbac SpinLockAcquire(&slot->mutex); MyProc->xmin = InvalidTransactionId; + MyProc->catalog_xmin = InvalidTransactionId; /* * For physical replication we don't need the interlock provided by xmin @@ -2739,6 +2740,7 @@ ProcessStandbyHSFeedbackMessage(void) && !TransactionIdIsNormal(feedbackCatalogXmin)) { MyProc->xmin = InvalidTransactionId; + MyProc->catalog_xmin = InvalidTransactionId; if (MyReplicationSlot != NULL) PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin); return; @@ -2780,22 +2782,17 @@ ProcessStandbyHSFeedbackMessage(void) * risk already since a VACUUM could already have determined the horizon.) * * If we're using a replication slot we reserve the xmin via that, - * otherwise via the walsender's PGPROC entry. We can only track the - * catalog xmin separately when using a slot, so we store the least of the - * two provided when not using a slot. - * - * XXX: It might make sense to generalize the ephemeral slot concept and - * always use the slot mechanism to handle the feedback xmin. + * otherwise via the walsender's PGPROC entry. With the PGPROC entry we + * track catalog_xmin separately from xmin, so that catalog_xmin only + * affects catalog table horizons and does not hold back vacuuming of + * regular user data tables. */ if (MyReplicationSlot != NULL) /* XXX: persistency configurable? 
*/ PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin); else { - if (TransactionIdIsNormal(feedbackCatalogXmin) - && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin)) - MyProc->xmin = feedbackCatalogXmin; - else - MyProc->xmin = feedbackXmin; + MyProc->xmin = feedbackXmin; + MyProc->catalog_xmin = feedbackCatalogXmin; } } diff --git a/src/test/recovery/t/053_hs_feedback_catalog_xmin.pl b/src/test/recovery/t/053_hs_feedback_catalog_xmin.pl new file mode 100644 index 0000000..1234567 --- /dev/null +++ b/src/test/recovery/t/053_hs_feedback_catalog_xmin.pl @@ -0,0 +1,181 @@ +# Copyright (c) 2025-2026, PostgreSQL Global Development Group +# +# Test that hot standby feedback catalog_xmin does not hold back vacuuming +# of regular user data tables on the primary. +# +# Scenario: primary -> standby (no physical replication slot), with +# hot_standby_feedback = on. The standby has a logical replication slot +# which sets a catalog_xmin. Before the fix, the catalog_xmin would be +# merged into the walsender's MyProc->xmin on the primary, incorrectly +# preventing vacuuming of dead tuples in user data tables. After the fix, +# catalog_xmin is tracked separately via MyProc->catalog_xmin and only +# affects catalog table horizons. 
+ +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Initialize primary - no replication slot will be used for the standby +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 'logical'); +$node_primary->append_conf( + 'postgresql.conf', qq[ +wal_level = logical +max_replication_slots = 10 +max_wal_senders = 10 +hot_standby_feedback = on +autovacuum = off +wal_receiver_status_interval = 1s +]); +$node_primary->start; + +# Take backup and create standby WITHOUT a replication slot +$node_primary->backup('my_backup'); + +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, 'my_backup', + has_streaming => 1); +$node_standby->append_conf( + 'postgresql.conf', qq[ +hot_standby_feedback = on +wal_receiver_status_interval = 1s +]); +$node_standby->start; + +# Wait for standby to connect and start streaming +$node_primary->poll_query_until( + 'postgres', + "SELECT count(*) >= 1 FROM pg_stat_replication WHERE state = 'streaming';") + or die "Timed out waiting for standby to start streaming"; + +# Create a test table on the primary and generate some dead tuples +$node_primary->safe_psql('postgres', qq[ + CREATE TABLE test_data (id int, val text); + INSERT INTO test_data SELECT g, 'initial' FROM generate_series(1, 1000) g; +]); + +# Wait for standby to catch up +$node_primary->wait_for_catchup($node_standby); + +# Record the current xmin state - before creating logical slot on standby, +# the walsender should have no xmin holding back vacuum. +my $xmin_before = $node_primary->safe_psql('postgres', + "SELECT backend_xmin FROM pg_stat_replication WHERE state = 'streaming' LIMIT 1;"); +note "walsender xmin before logical slot: '$xmin_before'"; + +# Create a logical replication slot on the standby. 
This will establish a +# catalog_xmin on the standby, which will be reported back to the primary +# via hot standby feedback. +$node_standby->safe_psql('postgres', qq[ + SELECT pg_create_logical_replication_slot('test_logical_slot', 'test_decoding'); +]); + +# Generate some transactions on the primary so the xmin/catalog_xmin diverge +$node_primary->safe_psql('postgres', qq[ + UPDATE test_data SET val = 'updated' WHERE id <= 500; +]); + +# Wait for standby to catch up and feedback to be applied +$node_primary->wait_for_catchup($node_standby); + +# Wait until hot standby feedback has set the walsender's backend_xmin +# (poll_query_until retries up to the framework's default timeout) +$node_primary->poll_query_until( + 'postgres', qq[ + SELECT backend_xmin IS NOT NULL + FROM pg_stat_replication + WHERE state = 'streaming'; +]) or die "Timed out waiting for hot standby feedback to set backend_xmin"; + +# Now check the key invariant: the walsender's backend_xmin on the primary +# should reflect feedbackXmin (data horizon), NOT the older catalog_xmin. +# +# Get the standby's logical slot catalog_xmin +my $standby_catalog_xmin = $node_standby->safe_psql('postgres', + "SELECT catalog_xmin FROM pg_replication_slots WHERE slot_name = 'test_logical_slot';"); +note "standby logical slot catalog_xmin: $standby_catalog_xmin"; + +# Get the walsender's backend_xmin on the primary +my $walsender_xmin = $node_primary->safe_psql('postgres', + "SELECT backend_xmin FROM pg_stat_replication WHERE state = 'streaming' LIMIT 1;"); +note "primary walsender backend_xmin: $walsender_xmin"; + +# The walsender's xmin should NOT be the catalog_xmin (which is older). +# If they are equal, the old bug is still present - catalog_xmin is leaking +# into data horizon. +# +# Note: we can't just compare them directly because xmin could be the same +# if no new transactions happened. Instead, generate more transactions and +# verify that vacuum can clean up dead tuples in user tables. 
+ +# Generate dead tuples on the primary +$node_primary->safe_psql('postgres', qq[ + UPDATE test_data SET val = 'updated2'; +]); + +# More transactions to advance xmin +$node_primary->safe_psql('postgres', qq[ + BEGIN; + CREATE TABLE dummy_advance AS SELECT generate_series(1, 100) AS x; + DROP TABLE dummy_advance; + COMMIT; +]); + +# Wait for the standby to replay +$node_primary->wait_for_catchup($node_standby); +sleep(2); # Wait for hs feedback + +# Now vacuum the user table - this should be able to clean dead tuples +# because catalog_xmin should NOT hold back data table vacuum +my $dead_before = $node_primary->safe_psql('postgres', + "SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = 'test_data';"); +note "dead tuples before vacuum: $dead_before"; + +$node_primary->safe_psql('postgres', "VACUUM VERBOSE test_data;"); + +my $dead_after = $node_primary->safe_psql('postgres', + "SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = 'test_data';"); +note "dead tuples after vacuum: $dead_after"; + +# The dead tuples should have been cleaned up. +# With the old code, catalog_xmin would prevent this. +ok($dead_after < $dead_before, + 'vacuum cleaned dead tuples in user table despite standby catalog_xmin'); + +# Verify that the catalog horizon is still properly protected. +# The walsender's catalog_xmin should still be reflected in the system. +# Check that the standby's logical slot still has its catalog_xmin. 
+my $slot_catalog_xmin_after = $node_standby->safe_psql('postgres', + "SELECT catalog_xmin FROM pg_replication_slots WHERE slot_name = 'test_logical_slot';"); +note "standby slot catalog_xmin after vacuum: $slot_catalog_xmin_after"; +ok($slot_catalog_xmin_after ne '', + 'standby logical slot catalog_xmin is still set after primary vacuum'); + +# Also verify via pg_stat_replication that the walsender xmin is set +# (data protection is still working) +my $final_xmin = $node_primary->safe_psql('postgres', + "SELECT backend_xmin FROM pg_stat_replication WHERE state = 'streaming' LIMIT 1;"); +note "final walsender backend_xmin: $final_xmin"; + +# Additional test: verify that catalog_xmin and xmin are different on the +# primary's walsender (xmin should be newer/larger than catalog_xmin). +# The standby should report both separately and the primary should track +# them independently. +# +# Get the replication horizons: the walsender's proc->xmin should be the +# data horizon (feedbackXmin), not min(feedbackXmin, feedbackCatalogXmin). +# We test this indirectly: if catalog_xmin were merged into xmin, the +# walsender's xmin would be very old (= catalog_xmin), and vacuum would +# not be able to clean dead tuples. Since vacuum DID clean them (tested +# above), the separation is working. + +# Cleanup +$node_standby->safe_psql('postgres', + "SELECT pg_drop_replication_slot('test_logical_slot');"); + +$node_standby->stop; +$node_primary->stop; + +done_testing(); -- 2.39.0