From 518b70713ba08956364f5d48b632ff403eaa8d72 Mon Sep 17 00:00:00 2001
From: Mikhail Kharitonov
Date: Wed, 3 Dec 2025 11:17:14 +0300
Subject: [PATCH] VACUUM: use relation birth XID to refine data horizon for new
 relations

Store a relation birth XID in pg_class and use it to refine the
VISHORIZON_DATA horizon so that transactions that started before the
relation existed do not hold back vacuuming of that relation.
---
 src/backend/catalog/heap.c                    |  11 +
 src/backend/storage/ipc/procarray.c           | 131 ++++++++++-
 src/backend/utils/cache/relcache.c            |  31 +++
 src/include/catalog/pg_class.h                |   5 +-
 src/include/utils/rel.h                       |   4 +
 .../recovery/t/121_vacuum_xid_horizons.pl     | 211 ++++++++++++++++++
 6 files changed, 391 insertions(+), 2 deletions(-)
 create mode 100644 src/test/recovery/t/121_vacuum_xid_horizons.pl

diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index fd6537567ea..2e3d051eaa6 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -917,6 +917,7 @@ InsertPgClassTuple(Relation pg_class_desc,
 	Datum		values[Natts_pg_class];
 	bool		nulls[Natts_pg_class];
 	HeapTuple	tup;
+	TransactionId cxid = InvalidTransactionId;
 
 	/* This is a tad tedious, but way cleaner than what we used to do... */
 	memset(values, 0, sizeof(values));
@@ -953,6 +954,16 @@ InsertPgClassTuple(Relation pg_class_desc,
 	values[Anum_pg_class_relrewrite - 1] = ObjectIdGetDatum(rd_rel->relrewrite);
 	values[Anum_pg_class_relfrozenxid - 1] = TransactionIdGetDatum(rd_rel->relfrozenxid);
 	values[Anum_pg_class_relminmxid - 1] = MultiXactIdGetDatum(rd_rel->relminmxid);
+	/*
+	 * Record the creating top-level transaction as the relation's birth XID.
+	 * Shared relations, bootstrap mode and recovery keep
+	 * InvalidTransactionId ("unknown").
+	 */
+	if (!rd_rel->relisshared && !IsBootstrapProcessingMode() &&
+		!RecoveryInProgress())
+		cxid = GetTopTransactionId();
+	values[Anum_pg_class_relminxid - 1] = TransactionIdGetDatum(cxid);
+
 	if (relacl != (Datum) 0)
 		values[Anum_pg_class_relacl - 1] = relacl;
 	else
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 200f72c6e25..31852e59a88 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -1938,6 +1938,128 @@ GlobalVisHorizonKindForRel(Relation rel)
 	return VISHORIZON_TEMP;
 }
 
+/*
+ * ComputeDataHorizonForRelation
+ *		Compute the data horizon for a relation born in create_xid.
+ *
+ * Scans the proc array like the generic data-horizon computation, but
+ * leaves out backends whose XID (and xmin, if set) precede create_xid.
+ * KnownAssignedXids (in recovery) and the replication slot xmin are always
+ * applied.  With an invalid create_xid no backend is excluded on that basis.
+ */
+static TransactionId
+ComputeDataHorizonForRelation(TransactionId create_xid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	bool		in_recovery = RecoveryInProgress();
+	TransactionId kaxmin = InvalidTransactionId;
+	TransactionId slot_xmin;
+	TransactionId horizon;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	/*
+	 * Start from latestCompletedXid + 1, as ComputeXidHorizons() does.  Read
+	 * it under ProcArrayLock so it is consistent with the scan below.
+	 */
+	horizon = XidFromFullTransactionId(TransamVariables->latestCompletedXid);
+	Assert(TransactionIdIsValid(horizon));
+	TransactionIdAdvance(horizon);
+
+	/*
+	 * Take replication slot xmin into account globally; slots always limit
+	 * how far horizons can advance.
+	 */
+	slot_xmin = procArray->replication_slot_xmin;
+
+	for (int i = 0; i < arrayP->numProcs; i++)
+	{
+		int			pgprocno = arrayP->pgprocnos[i];
+		PGPROC	   *proc = &allProcs[pgprocno];
+		int8		flags = ProcGlobal->statusFlags[i];
+		TransactionId xid;
+		TransactionId xmin;
+		TransactionId eff;
+
+		/* As in ComputeXidHorizons: skip VACUUM and logical decoding. */
+		if (flags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+			continue;
+
+		/*
+		 * Only consider backends in the current database (as usual for
+		 * the data horizon).
+		 */
+		if (!(proc->databaseId == MyDatabaseId ||
+			  MyDatabaseId == InvalidOid || /* starting backend */
+			  (flags & PROC_AFFECTS_ALL_HORIZONS) ||
+			  in_recovery))
+			continue;
+
+		/* xid and xmin are 32-bit TransactionIds; fetch each just once. */
+		xid = UINT32_ACCESS_ONCE(ProcGlobal->xids[i]);
+		xmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+		/*
+		 * As in the generic horizon computation, use the older of
+		 * xid/xmin.
+		 */
+		eff = TransactionIdOlder(xmin, xid);
+
+		/* If neither xid nor xmin is valid, this backend does not matter. */
+		if (!TransactionIdIsValid(eff))
+			continue;
+
+		/*
+		 * Filter by relation creation xid:
+		 *
+		 * If the transaction started before create_xid (xid < create_xid)
+		 * and either it has no snapshot yet (xmin invalid), or its
+		 * snapshot also predates the relation's creation (xmin <
+		 * create_xid), then this backend cannot see any tuples in the
+		 * relation and can be ignored for this horizon.
+		 *
+		 * A backend without an XID must not be treated as "started before":
+		 * TransactionIdPrecedes() sorts InvalidTransactionId ahead of every
+		 * normal XID, so test validity explicitly.
+		 *
+		 * NOTE(review): xmin is only the oldest XID the snapshot treats as
+		 * running; please confirm that xmin < create_xid is enough to rule
+		 * out visibility of the relation's tuples.
+		 */
+		if (TransactionIdIsValid(create_xid))
+		{
+			bool		started_before_create;
+			bool		snapshot_before_create;
+
+			started_before_create =
+				TransactionIdIsValid(xid) &&
+				TransactionIdPrecedes(xid, create_xid);
+			snapshot_before_create =
+				!TransactionIdIsValid(xmin) ||
+				TransactionIdPrecedes(xmin, create_xid);
+
+			if (started_before_create && snapshot_before_create)
+				continue;	/* cannot require preserving tuples in this relation */
+		}
+
+		/* Otherwise, include its effective xid/xmin in the MIN() computation. */
+		horizon = TransactionIdOlder(horizon, eff);
+	}
+
+	if (in_recovery)
+		kaxmin = KnownAssignedXidsGetOldestXmin();
+
+	LWLockRelease(ProcArrayLock);
+
+	if (in_recovery)
+		horizon = TransactionIdOlder(horizon, kaxmin);
+
+	/* Replication slots still limit the horizon. */
+	horizon = TransactionIdOlder(horizon, slot_xmin);
+
+	return horizon;
+}
+
 /*
  * Return the oldest XID for which deleted tuples must be preserved in the
  * passed table.
@@ -1963,7 +2085,14 @@ GetOldestNonRemovableTransactionId(Relation rel)
 		case VISHORIZON_CATALOG:
 			return horizons.catalog_oldest_nonremovable;
 		case VISHORIZON_DATA:
-			return horizons.data_oldest_nonremovable;
+			{
+				/* Braces: a case label cannot directly precede a declaration. */
+				TransactionId create_xid = RelationGetCreationXid(rel);
+
+				if (TransactionIdIsValid(create_xid))
+					return ComputeDataHorizonForRelation(create_xid);
+				return horizons.data_oldest_nonremovable;
+			}
 		case VISHORIZON_TEMP:
 			return horizons.temp_oldest_nonremovable;
 	}
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 915d0bc9084..a67498d6585 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1316,6 +1316,9 @@ retry:
 	/* It's fully valid */
 	relation->rd_isvalid = true;
 
+	relation->rd_creation_xid = InvalidTransactionId;
+	relation->rd_creation_xid_valid = false;
+
 #ifdef MAYBE_RECOVER_RELATION_BUILD_MEMORY
 	if (tmpcxt)
 	{
@@ -6993,3 +6996,31 @@ ResOwnerReleaseRelation(Datum res)
 
 	RelationCloseCleanup((Relation) DatumGetPointer(res));
 }
+
+/*
+ * RelationGetCreationXid
+ *		Return the birth XID stored in pg_class.relminxid when it is a normal
+ *		XID, else InvalidTransactionId (also for a NULL rel).  A normal
+ *		value is remembered in the relcache entry.
+ */
+TransactionId
+RelationGetCreationXid(Relation rel)
+{
+	if (rel == NULL)
+		return InvalidTransactionId;
+
+	if (rel->rd_creation_xid_valid)
+		return rel->rd_creation_xid;
+
+	if (rel->rd_rel)
+	{
+		TransactionId x = rel->rd_rel->relminxid;
+		if (TransactionIdIsNormal(x))
+		{
+			rel->rd_creation_xid = x;
+			rel->rd_creation_xid_valid = true;
+			return x;
+		}
+	}
+	return InvalidTransactionId;
+}
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index 07d182da796..6e2408fda71 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -131,6 +131,9 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
 	/* all multixacts in this rel are >= this; it is really a MultiXactId */
 	TransactionId relminmxid BKI_DEFAULT(1);	/* FirstMultiXactId */
 
+	/* Birth XID of the relation. 0 = InvalidTransactionId = unknown. */
+	TransactionId relminxid BKI_DEFAULT(0);
+
 #ifdef CATALOG_VARLEN			/* variable-length fields start here */
 	/* NOTE: These fields are not present in a relcache entry's rd_rel field. */
 	/* access permissions */
@@ -146,7 +149,7 @@ CATALOG(pg_class,1259,RelationRelationId) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83,Relat
 
 /* Size of fixed part of pg_class tuples, not counting var-length fields */
 #define CLASS_TUPLE_SIZE \
-	 (offsetof(FormData_pg_class,relminmxid) + sizeof(TransactionId))
+	 (offsetof(FormData_pg_class,relminxid) + sizeof(TransactionId))
 
 /* ----------------
  *		Form_pg_class corresponds to a pointer to a tuple with
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 80286076a11..fb4eafeea93 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -108,6 +108,9 @@ typedef struct RelationData
 													 * any value */
 	SubTransactionId rd_droppedSubid;	/* dropped with another Subid set */
 
+	TransactionId rd_creation_xid;	/* cached pg_class.relminxid, when normal */
+	bool		rd_creation_xid_valid;	/* is rd_creation_xid set? */
+
 	Form_pg_class rd_rel;		/* RELATION tuple */
 	TupleDesc	rd_att;			/* tuple descriptor */
 	Oid			rd_id;			/* relation's object id */
@@ -718,4 +721,5 @@ RelationCloseSmgr(Relation relation)
 extern void RelationIncrementReferenceCount(Relation rel);
 extern void RelationDecrementReferenceCount(Relation rel);
 
+extern TransactionId RelationGetCreationXid(Relation rel);
 #endif							/* REL_H */
diff --git a/src/test/recovery/t/121_vacuum_xid_horizons.pl b/src/test/recovery/t/121_vacuum_xid_horizons.pl
new file mode 100644
index 00000000000..58bd45d8a1a
--- /dev/null
+++ b/src/test/recovery/t/121_vacuum_xid_horizons.pl
@@ -0,0 +1,211 @@
+use strict;
+use warnings;
+use Test::More;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+# Start a minimal cluster to exercise VACUUM and XID horizon behavior.
+my $node = PostgreSQL::Test::Cluster->new('relhorizon');
+$node->init;
+
+# Keep autovacuum out of the way and force VACUUM VERBOSE messages to be in
+# English, since the checks below match on their text.
+$node->append_conf('postgresql.conf', q{
+	client_min_messages = info
+	autovacuum = off
+	lc_messages = 'C'
+});
+$node->start;
+
+# Run VACUUM (VERBOSE) on $relname and return everything psql printed.
+# The report may arrive on stderr, so stdout and stderr are concatenated.
+sub vacuum_verbose {
+	my ($relname) = @_;
+	my ($out, $err) = ('', '');
+	$node->psql('postgres', "VACUUM (VERBOSE) $relname;", stdout => \$out, stderr => \$err);
+	return $out . $err;
+}
+
+# Helper regexp: VACUUM either truncated the table or removed at least one
+# tuple.  The count must be non-zero, otherwise "tuples: 0 removed" would
+# satisfy the check.  The table name is known from the test context.
+my $vac_removed_or_truncated_re =
+	qr/(?:table "[^"]+": truncated \d+ to \d+ pages|tuples:\s+[1-9]\d*\s+removed)/;
+
+# Helper regexp: VACUUM reports dead tuples that are "not yet removable".
+my $vac_not_yet_removable_re =
+	qr/\b[1-9]\d*\s+are dead but not yet removable/;
+
+# One "old" session kept open to emulate a long-running transaction that
+# influences the global XID horizon.
+my $sess_old = $node->background_psql('postgres');
+
+# ============================================================
+# T1: relation created after the long-running transaction;
+#     VACUUM on such a relation should not be constrained by it
+# ============================================================
+$sess_old->query('BEGIN;');
+$sess_old->query('SELECT txid_current();');    # ensure the session has an XID
+
+$node->safe_psql('postgres', q{
+	DROP TABLE IF EXISTS t1;
+	CREATE TABLE t1(id int);
+	INSERT INTO t1 SELECT generate_series(1,1000);
+	DELETE FROM t1;
+});
+
+my $vac_t1 = vacuum_verbose('t1');
+
+# VACUUM should be able to either truncate the relation or remove dead tuples.
+like(
+	$vac_t1,
+	$vac_removed_or_truncated_re,
+	'T1: VACUUM on relation created after old transaction can remove dead tuples');
+
+# The old transaction is still open at this point; it must not cause any
+# tuple of t1 to be reported as "dead but not yet removable".
+unlike(
+	$vac_t1,
+	$vac_not_yet_removable_re,
+	'T1: old long-running transaction does not retain dead tuples in new relation');
+
+# Cross-check through the statistics view: the dead tuple counter for t1
+# should read zero as well.
+my $dead_t1 = $node->safe_psql('postgres', q{
+	SELECT n_dead_tup FROM pg_stat_all_tables WHERE relname = 't1';
+});
+chomp($dead_t1);
+is($dead_t1, '0', 'T1: pg_stat_all_tables reports zero dead tuples');
+
+# ============================================================
+# T2: REPEATABLE READ transaction started after relation creation
+#     must retain dead tuples until it commits
+# ============================================================
+
+# Set up t2 with a handful of live rows.
+$node->safe_psql('postgres', q{
+	DROP TABLE IF EXISTS t2;
+	CREATE TABLE t2(id int);
+	INSERT INTO t2 SELECT generate_series(1,5);
+});
+
+# Open a second background session in REPEATABLE READ mode and read t2 once,
+# so that it holds a snapshot taken while the rows were still live.
+my $sess_rr = $node->background_psql('postgres');
+$sess_rr->query('BEGIN ISOLATION LEVEL REPEATABLE READ;');
+$sess_rr->query('SELECT * FROM t2;');    # take a snapshot
+
+# Remove every row from outside that transaction.
+$node->safe_psql('postgres', q{
+	DELETE FROM t2;
+});
+
+# As long as the REPEATABLE READ transaction stays open, VACUUM is expected
+# to classify the deleted rows as "not yet removable".
+my $vac_t2_hold = vacuum_verbose('t2');
+like(
+	$vac_t2_hold,
+	$vac_not_yet_removable_re,
+	'T2: REPEATABLE READ snapshot keeps dead tuples while transaction is open');
+
+# Committing the REPEATABLE READ transaction lifts that restriction.
+$sess_rr->query('COMMIT;');
+
+my $vac_t2_after = vacuum_verbose('t2');
+like(
+	$vac_t2_after,
+	$vac_removed_or_truncated_re,
+	'T2: after REPEATABLE READ commit VACUUM can remove dead tuples');
+
+# ============================================================
+# T3: VACUUM FULL rewrite must preserve the "birth XID" so that
+#     subsequent VACUUM can still remove dead tuples correctly
+# ============================================================
+
+# Create t3 and immediately turn all of its rows into dead tuples.
+$node->safe_psql('postgres', q{
+	DROP TABLE IF EXISTS t3;
+	CREATE TABLE t3(id int);
+	INSERT INTO t3 SELECT generate_series(1,10);
+	DELETE FROM t3;
+});
+
+# Baseline: a plain VACUUM before the relation is rewritten.
+my $vac_t3_first = vacuum_verbose('t3');
+like(
+	$vac_t3_first,
+	$vac_removed_or_truncated_re,
+	'T3: initial VACUUM on rewritten candidate relation works');
+
+# Rewrite the relation with VACUUM FULL, then create and delete tuples again.
+$node->safe_psql('postgres', q{
+	VACUUM FULL t3;
+	INSERT INTO t3 SELECT generate_series(11,20);
+	DELETE FROM t3;
+});
+
+# The same expectation has to hold once the rewrite has happened.
+my $vac_t3_second = vacuum_verbose('t3');
+like(
+	$vac_t3_second,
+	$vac_removed_or_truncated_re,
+	'T3: VACUUM after VACUUM FULL rewrite still removes dead tuples');
+
+# ============================================================
+# T4: partitioned table — a partition created before the long-running
+#     transaction must be held; a newer partition must not
+# ============================================================
+
+$node->safe_psql('postgres', q{
+	DROP TABLE IF EXISTS p_parent CASCADE;
+	CREATE TABLE p_parent(id int, payload text) PARTITION BY RANGE (id);
+	CREATE TABLE p_child1 PARTITION OF p_parent FOR VALUES FROM (1) TO (1000);
+});
+
+# End the long-running transaction and begin a fresh one now that the first
+# partition exists, so that p_child1 is older than the new transaction's XID.
+$sess_old->query('COMMIT;');
+$sess_old->query('BEGIN;');
+$sess_old->query('SELECT txid_current();');
+
+# Add a second, newer partition; then fill and empty both partitions.
+$node->safe_psql('postgres', q{
+	CREATE TABLE p_child2 PARTITION OF p_parent FOR VALUES FROM (1000) TO (2000);
+	INSERT INTO p_child1 SELECT generate_series(1,100), repeat('x',10);
+	DELETE FROM p_child1;
+	INSERT INTO p_child2 SELECT generate_series(1000,1100), repeat('x',10);
+	DELETE FROM p_child2;
+});
+
+# p_child2 was created after the long-running transaction: expect full cleanup.
+my $vac_p2 = vacuum_verbose('p_child2');
+like(
+	$vac_p2,
+	$vac_removed_or_truncated_re,
+	'T4: partition created after old transaction can be vacuumed fully');
+
+# p_child1 predates that transaction: its dead tuples must be kept for now.
+my $vac_p1_held = vacuum_verbose('p_child1');
+like(
+	$vac_p1_held,
+	$vac_not_yet_removable_re,
+	'T4: partition created before old transaction retains dead tuples');
+
+# With the long-running transaction committed, the older partition can be
+# cleaned up too.
+$sess_old->query('COMMIT;');
+
+my $vac_p1_after = vacuum_verbose('p_child1');
+like(
+	$vac_p1_after,
+	$vac_removed_or_truncated_re,
+	'T4: after old transaction commits older partition can be vacuumed fully');
+
+# Shut down the background sessions and the cluster.
+if ($sess_rr)
+{
+	$sess_rr->quit;
+}
+$sess_old->quit;
+$node->stop;
+done_testing();
-- 
2.34.1