From e84d56b24d034694822710bca971eb7ac3a5a930 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 26 Nov 2025 18:07:01 +0200
Subject: [PATCH v11 1/1] Avoid multixact edge case 2 by writing the next
 offset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RecordNewMultiXact now writes the offset of the
following multixact so readers never block waiting for “case 2”.
The new TAP test reproduces IPC/MultixactCreation hangs and verifies
that previously recorded multis stay readable across crash recovery.

Author: Andrey Borodin <amborodin@acm.org>
Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru>
Reviewed-by: Álvaro Herrera <alvherre@postgresql.org>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru>
---
 src/backend/access/transam/multixact.c        | 150 +++++++++++-------
 src/test/modules/test_slru/t/001_multixact.pl | 141 ++++++++--------
 src/test/modules/test_slru/test_multixact.c   |   1 -
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  40 +++++
 src/test/recovery/t/017_shm.pl                |  38 +----
 5 files changed, 201 insertions(+), 169 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 9d5f130af7e..d1e15ff5624 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -79,7 +79,6 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
-#include "storage/condition_variable.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -271,12 +270,6 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
-	/*
-	 * This is used to sleep until a multixact offset is written when we want
-	 * to create the next one.
-	 */
-	ConditionVariable nextoff_cv;
-
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	int			entryno;
 	int			slotno;
 	MultiXactOffset *offptr;
-	int			i;
+	MultiXactId next;
+	int64		next_pageno;
+	int			next_entryno;
+	MultiXactOffset *next_offptr;
 	LWLock	   *lock;
 	LWLock	   *prevlock = NULL;
 
+	/* position of this multixid in the offsets SLRU area  */
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	/* position of the next multixid */
+	next = multi + 1;
+	if (next < FirstMultiXactId)
+		next = FirstMultiXactId;
+	next_pageno = MultiXactIdToOffsetPage(next);
+	next_entryno = MultiXactIdToOffsetEntry(next);
+
+	/*
+	 * Set the starting offset of this multixid's members.
+	 *
+	 * In the common case, it was already be set by the previous
+	 * RecordNewMultiXact call, as this was the next multixid of the previous
+	 * multixid.  But if multiple backends are generating multixids
+	 * concurrently, we might race ahead and get called before previous
+	 * multixid.
+	 */
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
 
@@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	if (*offptr != offset)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*offptr == 0);
+		*offptr = offset;
+		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	}
 
-	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	/*
+	 * Set the next multixid's offset to the end of this multixid's members.
+	 */
+	if (next_pageno == pageno)
+	{
+		next_offptr = offptr + 1;
+	}
+	else
+	{
+		/* must be the first entry on the page */
+		Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+		/* Swap the lock for a lock on the next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+		next_offptr += next_entryno;
+	}
+
+	if (*next_offptr != offset + nmembers)
+	{
+		/* should already be set to the correct value, or not at all */
+		Assert(*next_offptr == 0);
+		*next_offptr = offset + nmembers;
+		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+	}
 
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
-	 */
-	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
 	prev_pageno = -1;
 
-	for (i = 0; i < nmembers; i++, offset++)
+	for (int i = 0; i < nmembers; i++, offset++)
 	{
 		TransactionId *memberptr;
 		uint32	   *flagsptr;
@@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 			result = FirstMultiXactId;
 	}
 
-	/* Make sure there is room for the MXID in the file.  */
-	ExtendMultiXactOffset(result);
+	/*
+	 * Make sure there is room for the next MXID in the file.  Assigning this
+	 * MXID sets the next MXID offset already.
+	 */
+	ExtendMultiXactOffset(result + 1);
 
 	/*
 	 * Reserve the members space, similarly to above.  Also, be careful not to
@@ -1304,7 +1348,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	MultiXactOffset nextOffset;
 	MultiXactMember *ptr;
 	LWLock	   *lock;
-	bool		slept = false;
 
 	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
@@ -1381,23 +1424,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * one's.  However, there are some corner cases to worry about:
 	 *
 	 * 1. This multixact may be the latest one created, in which case there is
-	 * no next one to look at.  In this case the nextOffset value we just
-	 * saved is the correct endpoint.
+	 * no next one to look at.  The next multixact's offset should be set
+	 * already, as we set it in RecordNewMultiXact(), but we used to not do
+	 * that in older minor versions.  To cope with that case, if this
+	 * multixact is the latest one created, use the nextOffset value we read
+	 * above as the endpoint.
 	 *
-	 * 2. The next multixact may still be in process of being filled in: that
-	 * is, another process may have done GetNewMultiXactId but not yet written
-	 * the offset entry for that ID.  In that scenario, it is guaranteed that
-	 * the offset entry for that multixact exists (because GetNewMultiXactId
-	 * won't release MultiXactGenLock until it does) but contains zero
-	 * (because we are careful to pre-zero offset pages). Because
-	 * GetNewMultiXactId will never return zero as the starting offset for a
-	 * multixact, when we read zero as the next multixact's offset, we know we
-	 * have this case.  We handle this by sleeping on the condition variable
-	 * we have just for this; the process in charge will signal the CV as soon
-	 * as it has finished writing the multixact offset.
-	 *
-	 * 3. Because GetNewMultiXactId increments offset zero to offset one to
-	 * handle case #2, there is an ambiguity near the point of offset
+	 * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero
+	 * for to mean "unset", there is an ambiguity near the point of offset
 	 * wraparound.  If we see next multixact's offset is one, is that our
 	 * multixact's actual endpoint, or did it end at zero with a subsequent
 	 * increment?  We handle this using the knowledge that if the zero'th
@@ -1409,7 +1443,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * cases, so it seems better than holding the MultiXactGenLock for a long
 	 * time on every multixact creation.
 	 */
-retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
@@ -1473,16 +1506,10 @@ retry:
 
 		if (nextMXOffset == 0)
 		{
-			/* Corner case 2: next multixact is still being filled in */
-			LWLockRelease(lock);
-			CHECK_FOR_INTERRUPTS();
-
-			INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-			ConditionVariableSleep(&MultiXactState->nextoff_cv,
-								   WAIT_EVENT_MULTIXACT_CREATION);
-			slept = true;
-			goto retry;
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("MultiXact %d has invalid next offset",
+							multi)));
 		}
 
 		length = nextMXOffset - offset;
@@ -1491,12 +1518,6 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
-	/*
-	 * If we slept above, clean up state; it's no longer needed.
-	 */
-	if (slept)
-		ConditionVariableCancelSleep();
-
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 
 	truelength = 0;
@@ -1539,7 +1560,7 @@ retry:
 
 		if (!TransactionIdIsValid(*xactptr))
 		{
-			/* Corner case 3: we must be looking at unused slot zero */
+			/* Corner case 2: we must be looking at unused slot zero */
 			Assert(offset == 0);
 			continue;
 		}
@@ -1986,7 +2007,6 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
@@ -2132,26 +2152,36 @@ TrimMultiXact(void)
 						pageno);
 
 	/*
+	 * Set the offset of the last multixact on the offsets page.
+	 *
+	 * This is normally done in RecordNewMultiXact() of the previous
+	 * multixact, but we used to not do that in older minor versions.  To
+	 * ensure that the next offset is set if the binary was just upgraded from
+	 * an older minor version, do it now.
+	 *
 	 * Zero out the remainder of the current offsets page.  See notes in
 	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
 	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
 	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * nonzero offset values.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
-	if (entryno != 0)
 	{
 		int			slotno;
 		MultiXactOffset *offptr;
 		LWLock	   *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 
 		LWLockAcquire(lock, LW_EXCLUSIVE);
-		slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
+		if (entryno == 0)
+			slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+		else
+			slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
 		offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 		offptr += entryno;
 
-		MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+		*offptr = offset;
+		if ((entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ)
+			MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset));
 
 		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 		LWLockRelease(lock);
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..2f85802f920 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes')
 {
 	plan skip_all => 'Injection points not supported by this build';
 }
+if ($windows_os)
+{
+	plan skip_all => 'Kill9 works unpredicatably on Windows';
+}
 
 my ($node, $result);
 
@@ -25,96 +29,87 @@ $node = PostgreSQL::Test::Cluster->new('mike');
 $node->init;
 $node->append_conf('postgresql.conf',
 	"shared_preload_libraries = 'test_slru,injection_points'");
+# Set the cluster's next multitransaction to 0xFFFFFFF0.
+my $node_pgdata = $node->data_dir;
+command_ok(
+	[
+		'pg_resetwal',
+		'--multixact-ids' => '0xFFFFFFF0,0xFFFFFFF0',
+		$node_pgdata
+	],
+	"set the cluster's next multitransaction to 0xFFFFFFF0");
+command_ok(
+	[
+		'dd',
+		'if=/dev/zero',
+		"of=$node_pgdata/pg_multixact/offsets/FFFF",
+		'bs=4',
+		'count=65536'
+	],
+	"init SLRU file");
+
+command_ok(
+	[
+		'rm',
+		"$node_pgdata/pg_multixact/offsets/0000",
+	],
+	"drop old SLRU file");
+
 $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Another multixact test: loosing some multixact must not affect reading near
+# multixacts, even after a crash.
+my $bg_psql = $node->background_psql('postgres');
 
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $multi = $bg_psql->query_safe(
+	q(SELECT test_create_multixact();));
+
+# The space for next multi will be allocated, but it will never be actually
+# recorded.
 $node->safe_psql('postgres',
 	q{SELECT injection_points_attach('multixact-create-from-members','wait');}
 );
 
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-	qr/start/, q{
-	\echo start
+$bg_psql->query_until(
+	qr/deploying lost multi/, q(
+\echo deploying lost multi
 	SELECT test_create_multixact();
-});
+));
 
 $node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, test-multixact-read',
-	"matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, multixact-get-members-cv-sleep',
-	"matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
 $node->safe_psql('postgres',
 	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
 
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+	q{checkpoint;});
 
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# One more multitransaction to effectivelt emit WAL record about next
+# multitransaction (to avaoid corener case 1).
+$node->safe_psql('postgres',
+	q{SELECT test_create_multixact();});
+
+# All set and done, it's time for hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
+
+# Verify thet recorded multi is readble, this call must not hang.
+# Also note that all injection points disappeared after server restart.
+my $timed_out = 0;
+$node->safe_psql(
+	'postgres',
+	qq{SELECT test_read_multixact('$multi'::xid);},
+	timeout => $PostgreSQL::Test::Utils::timeout_default,
+	timed_out => \$timed_out);
+ok($timed_out == 0, 'recorded multi is readble');
+
+# Test mxidwraparound
+foreach my $i (1 .. 32) {
+$node->safe_psql('postgres',q{SELECT test_create_multixact();});
+}
 
 $node->stop;
 
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c
index 6c9b0420717..2fd67273ee5 100644
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
 	MultiXactId id = PG_GETARG_TRANSACTIONID(0);
 	MultiXactMember *members;
 
-	INJECTION_POINT("test-multixact-read", NULL);
 	/* discard caches */
 	AtEOXact_MultiXact();
 
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..e810f123f93 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1191,6 +1191,46 @@ sub start
 
 =pod
 
+=item $node->poll_start() => success_or_failure
+
+Polls start after kill9
+
+We may need retries to start a new postmaster.  Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+	my ($self) = @_;
+
+	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		$self->start(fail_ok => 1) && return 1;
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		# Clean up in case the start attempt just timed out or some such.
+		$self->stop('fast', fail_ok => 1);
+
+		$attempts++;
+	}
+
+	# Try one last time without fail_ok, which will BAIL_OUT unless it
+	# succeeds.
+	$self->start && return 1;
+	return 0;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index c73aa3f0c2c..ac238544217 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -67,7 +67,7 @@ log_ipcs();
 # Upon postmaster death, postmaster children exit automatically.
 $gnat->kill9;
 log_ipcs();
-poll_start($gnat);    # gnat recycles its former shm key.
+$gnat->poll_start;    # gnat recycles its former shm key.
 log_ipcs();
 
 note "removing the conflicting shmem ...";
@@ -82,7 +82,7 @@ log_ipcs();
 # the higher-keyed segment that the previous postmaster was using.
 # That's not great, but key collisions should be rare enough to not
 # make this a big problem.
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 $gnat->stop;
 log_ipcs();
@@ -174,43 +174,11 @@ $slow_client->finish;    # client has detected backend termination
 log_ipcs();
 
 # now startup should work
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 
 # finish testing
 $gnat->stop;
 log_ipcs();
 
-
-# We may need retries to start a new postmaster.  Causes:
-# - kernel is slow to deliver SIGKILL
-# - postmaster parent is slow to waitpid()
-# - postmaster child is slow to exit in response to SIGQUIT
-# - postmaster child is slow to exit after postmaster death
-sub poll_start
-{
-	my ($node) = @_;
-
-	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
-	my $attempts = 0;
-
-	while ($attempts < $max_attempts)
-	{
-		$node->start(fail_ok => 1) && return 1;
-
-		# Wait 0.1 second before retrying.
-		usleep(100_000);
-
-		# Clean up in case the start attempt just timed out or some such.
-		$node->stop('fast', fail_ok => 1);
-
-		$attempts++;
-	}
-
-	# Try one last time without fail_ok, which will BAIL_OUT unless it
-	# succeeds.
-	$node->start && return 1;
-	return 0;
-}
-
 done_testing();
-- 
2.47.3