From 6bcc6c35e480ffa02117c1e6591f0bccdc70ad12 Mon Sep 17 00:00:00 2001 From: Antonin Houska Date: Mon, 19 Jan 2026 16:07:45 +0100 Subject: [PATCH 1/2] Fix race conditions during the setup of logical decoding. Although it's rather unlikely, it can happen that the snapshot builder considers transaction committed (according to WAL) before the commit could be recorded in CLOG. In an extreme case, snapshot can even be created and used in between. Since both snapshot and CLOG are needed for visibility checks, this inconsistency can make them work incorrectly. The typical symptom is that a transaction that the snapshot considers not running anymore is (per CLOG) considered aborted instead of committed. Thus a new tuple version can be evaluated as invisible (if xmin is incorrectly considered aborted) or a deleted tuple version can be evaluated as visible (if xmax is incorrectly considered aborted). This patch fixes the problem by checking if all the XIDs that the new snapshot considers committed are really committed per CLOG. If at least one is not, the check is repeated after a short delay. However, a single check is sufficient in almost all cases, so the performance impact should be minimal. --- src/backend/replication/logical/snapbuild.c | 27 ++++++++++++++++++- .../utils/activity/wait_event_names.txt | 1 + 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 37f0c6028bd..d7ea098cb37 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -377,7 +377,7 @@ SnapBuildBuildSnapshot(SnapBuild *builder) /* * We misuse the original meaning of SnapshotData's xip and subxip fields - * to make the more fitting for our needs. + * to make them more fitting for our needs. * * In the 'xip' array we store transactions that have to be treated as * committed. Since we will only ever look at tuples from transactions @@ -402,6 +402,31 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->xmin = builder->xmin; snapshot->xmax = builder->xmax; + /* + * Although it's very unlikely, it's possible that a commit WAL record was + * decoded but CLOG is not aware of the commit yet. Should the CLOG update + * be delayed even more, visibility checks that use this snapshot could + * work incorrectly. Therefore we check the CLOG status here. + */ + for (int i = 0; i < builder->committed.xcnt; i++) + { + for (;;) + { + if (TransactionIdDidCommit(builder->committed.xip[i])) + break; + else + { + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 10L, + WAIT_EVENT_SNAPBUILD_CLOG); + ResetLatch(MyLatch); + } + CHECK_FOR_INTERRUPTS(); + } + } + /* store all transactions to be treated as committed by this snapshot */ snapshot->xip = (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 4aa864fe3c3..987df777e47 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -181,6 +181,7 @@ PG_SLEEP "Waiting due to a call to pg_sleep or a sibling fu RECOVERY_APPLY_DELAY "Waiting to apply WAL during recovery because of a delay setting." RECOVERY_RETRIEVE_RETRY_INTERVAL "Waiting during recovery when WAL data is not available from any source (pg_wal, archive or stream)." REGISTER_SYNC_REQUEST "Waiting while sending synchronization requests to the checkpointer, because the request queue is full." +SNAPBUILD_CLOG "Waiting for CLOG update before building snapshot." SPIN_DELAY "Waiting while acquiring a contended spinlock." VACUUM_DELAY "Waiting in a cost-based vacuum delay point." VACUUM_TRUNCATE "Waiting to acquire an exclusive lock to truncate off any empty pages at the end of a table vacuumed." -- 2.47.3