From e84d56b24d034694822710bca971eb7ac3a5a930 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 26 Nov 2025 18:07:01 +0200 Subject: [PATCH v11 1/1] Avoid multixact edge case 2 by writing the next offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RecordNewMultiXact now writes the offset of the following multixact so readers never block waiting for “case 2”. The new TAP test reproduces IPC/MultixactCreation hangs and verifies that previously recorded multis stay readable across crash recovery. Author: Andrey Borodin Reviewed-by: Dmitry Yurichev Reviewed-by: Álvaro Herrera Reviewed-by: Kirill Reshke Reviewed-by: Ivan Bykov --- src/backend/access/transam/multixact.c | 150 +++++++++++------- src/test/modules/test_slru/t/001_multixact.pl | 141 ++++++++-------- src/test/modules/test_slru/test_multixact.c | 1 - src/test/perl/PostgreSQL/Test/Cluster.pm | 40 +++++ src/test/recovery/t/017_shm.pl | 38 +---- 5 files changed, 201 insertions(+), 169 deletions(-) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 9d5f130af7e..d1e15ff5624 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -79,7 +79,6 @@ #include "pg_trace.h" #include "pgstat.h" #include "postmaster/autovacuum.h" -#include "storage/condition_variable.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -271,12 +270,6 @@ typedef struct MultiXactStateData /* support for members anti-wraparound measures */ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. - */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -912,13 +905,33 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int entryno; int slotno; MultiXactOffset *offptr; - int i; + MultiXactId next; + int64 next_pageno; + int next_entryno; + MultiXactOffset *next_offptr; LWLock *lock; LWLock *prevlock = NULL; + /* position of this multixid in the offsets SLRU area */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* position of the next multixid */ + next = multi + 1; + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + next_entryno = MultiXactIdToOffsetEntry(next); + + /* + * Set the starting offset of this multixid's members. + * + * In the common case, it was already be set by the previous + * RecordNewMultiXact call, as this was the next multixid of the previous + * multixid. But if multiple backends are generating multixids + * concurrently, we might race ahead and get called before previous + * multixid. + */ lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); @@ -933,22 +946,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + if (*offptr != offset) + { + /* should already be set to the correct value, or not at all */ + Assert(*offptr == 0); + *offptr = offset; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + /* + * Set the next multixid's offset to the end of this multixid's members. + */ + if (next_pageno == pageno) + { + next_offptr = offptr + 1; + } + else + { + /* must be the first entry on the page */ + Assert(next_entryno == 0 || next == FirstMultiXactId); + + /* Swap the lock for a lock on the next page */ + LWLockRelease(lock); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + next_offptr += next_entryno; + } + + if (*next_offptr != offset + nmembers) + { + /* should already be set to the correct value, or not at all */ + Assert(*next_offptr == 0); + *next_offptr = offset + nmembers; + MultiXactMemberCtl->shared->page_dirty[slotno] = true; + } /* Release MultiXactOffset SLRU lock. */ LWLockRelease(lock); - /* - * If anybody was waiting to know the offset of this multixact ID we just - * wrote, they can read it now, so wake them up. - */ - ConditionVariableBroadcast(&MultiXactState->nextoff_cv); - prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1138,8 +1179,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID offset already. + */ + ExtendMultiXactOffset(result + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1304,7 +1348,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1381,23 +1424,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * one's. However, there are some corner cases to worry about: * * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We handle this by sleeping on the condition variable - * we have just for this; the process in charge will signal the CV as soon - * as it has finished writing the multixact offset. - * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * for to mean "unset", there is an ambiguity near the point of offset * wraparound. If we see next multixact's offset is one, is that our * multixact's actual endpoint, or did it end at zero with a subsequent * increment? We handle this using the knowledge that if the zero'th @@ -1409,7 +1443,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ -retry: pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1473,16 +1506,10 @@ retry: if (nextMXOffset == 0) { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(lock); - CHECK_FOR_INTERRUPTS(); - - INJECTION_POINT("multixact-get-members-cv-sleep", NULL); - - ConditionVariableSleep(&MultiXactState->nextoff_cv, - WAIT_EVENT_MULTIXACT_CREATION); - slept = true; - goto retry; + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %d has invalid next offset", + multi))); } length = nextMXOffset - offset; @@ -1491,12 +1518,6 @@ retry: LWLockRelease(lock); lock = NULL; - /* - * If we slept above, clean up state; it's no longer needed. - */ - if (slept) - ConditionVariableCancelSleep(); - ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); truelength = 0; @@ -1539,7 +1560,7 @@ retry: if (!TransactionIdIsValid(*xactptr)) { - /* Corner case 3: we must be looking at unused slot zero */ + /* Corner case 2: we must be looking at unused slot zero */ Assert(offset == 0); continue; } @@ -1986,7 +2007,6 @@ MultiXactShmemInit(void) /* Make sure we zero out the per-backend state */ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); - ConditionVariableInit(&MultiXactState->nextoff_cv); } else Assert(found); @@ -2132,26 +2152,36 @@ TrimMultiXact(void) pageno); /* + * Set the offset of the last multixact on the offsets page. + * + * This is normally done in RecordNewMultiXact() of the previous + * multixact, but we used to not do that in older minor versions. To + * ensure that the next offset is set if the binary was just upgraded from + * an older minor version, do it now. + * * Zero out the remainder of the current offsets page. See notes in * TrimCLOG() for background. Unlike CLOG, some WAL record covers every * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * nonzero offset values. */ entryno = MultiXactIdToOffsetEntry(nextMXact); - if (entryno != 0) { int slotno; MultiXactOffset *offptr; LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + if (entryno == 0) + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + else + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + *offptr = offset; + if ((entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ) + MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset)); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; LWLockRelease(lock); diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl index e2b567a603d..2f85802f920 100644 --- a/src/test/modules/test_slru/t/001_multixact.pl +++ b/src/test/modules/test_slru/t/001_multixact.pl @@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes') { plan skip_all => 'Injection points not supported by this build'; } +if ($windows_os) +{ + plan skip_all => 'Kill9 works unpredicatably on Windows'; +} my ($node, $result); @@ -25,96 +29,87 @@ $node = PostgreSQL::Test::Cluster->new('mike'); $node->init; $node->append_conf('postgresql.conf', "shared_preload_libraries = 'test_slru,injection_points'"); +# Set the cluster's next multitransaction to 0xFFFFFFF0. +my $node_pgdata = $node->data_dir; +command_ok( + [ + 'pg_resetwal', + '--multixact-ids' => '0xFFFFFFF0,0xFFFFFFF0', + $node_pgdata + ], + "set the cluster's next multitransaction to 0xFFFFFFF0"); +command_ok( + [ + 'dd', + 'if=/dev/zero', + "of=$node_pgdata/pg_multixact/offsets/FFFF", + 'bs=4', + 'count=65536' + ], + "init SLRU file"); + +command_ok( + [ + 'rm', + "$node_pgdata/pg_multixact/offsets/0000", + ], + "drop old SLRU file"); + $node->start; $node->safe_psql('postgres', q(CREATE EXTENSION injection_points)); $node->safe_psql('postgres', q(CREATE EXTENSION test_slru)); -# Test for Multixact generation edge case -$node->safe_psql('postgres', - q{select injection_points_attach('test-multixact-read','wait')}); -$node->safe_psql('postgres', - q{select injection_points_attach('multixact-get-members-cv-sleep','wait')} -); +# Another multixact test: loosing some multixact must not affect reading near +# multixacts, even after a crash. +my $bg_psql = $node->background_psql('postgres'); -# This session must observe sleep on the condition variable while generating a -# multixact. To achieve this it first will create a multixact, then pause -# before reading it. -my $observer = $node->background_psql('postgres'); - -# This query will create a multixact, and hang just before reading it. -$observer->query_until( - qr/start/, - q{ - \echo start - SELECT test_read_multixact(test_create_multixact()); -}); -$node->wait_for_event('client backend', 'test-multixact-read'); - -# This session will create the next Multixact. This is necessary to avoid -# multixact.c's non-sleeping edge case 1. -my $creator = $node->background_psql('postgres'); +my $multi = $bg_psql->query_safe( + q(SELECT test_create_multixact();)); + +# The space for next multi will be allocated, but it will never be actually +# recorded. $node->safe_psql('postgres', q{SELECT injection_points_attach('multixact-create-from-members','wait');} ); -# We expect this query to hang in the critical section after generating new -# multixact, but before filling its offset into SLRU. -# Running an injection point inside a critical section requires it to be -# loaded beforehand. -$creator->query_until( - qr/start/, q{ - \echo start +$bg_psql->query_until( + qr/deploying lost multi/, q( +\echo deploying lost multi SELECT test_create_multixact(); -}); +)); $node->wait_for_event('client backend', 'multixact-create-from-members'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, test-multixact-read', - "matching injection point waits"); - -# Now wake observer to get it to read the initial multixact. A subsequent -# multixact already exists, but that one doesn't have an offset assigned, so -# this will hit multixact.c's edge case 2. -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('test-multixact-read')}); -$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, multixact-get-members-cv-sleep', - "matching injection point waits"); - -# Now we have two backends waiting in multixact-create-from-members and -# multixact-get-members-cv-sleep. Also we have 3 injections points set to wait. -# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must -# detach it first. So let's detach all injection points, then wake up all -# backends. - -$node->safe_psql('postgres', - q{SELECT injection_points_detach('test-multixact-read')}); $node->safe_psql('postgres', q{SELECT injection_points_detach('multixact-create-from-members')}); -$node->safe_psql('postgres', - q{SELECT injection_points_detach('multixact-get-members-cv-sleep')}); $node->safe_psql('postgres', - q{SELECT injection_points_wakeup('multixact-create-from-members')}); -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')}); + q{checkpoint;}); -# Background psql will now be able to read the result and disconnect. -$observer->quit; -$creator->quit; +# One more multitransaction to effectivelt emit WAL record about next +# multitransaction (to avaoid corener case 1). +$node->safe_psql('postgres', + q{SELECT test_create_multixact();}); + +# All set and done, it's time for hard restart +$node->kill9; +$node->stop('immediate', fail_ok => 1); +$node->poll_start; +$bg_psql->{run}->finish; + +# Verify thet recorded multi is readble, this call must not hang. +# Also note that all injection points disappeared after server restart. +my $timed_out = 0; +$node->safe_psql( + 'postgres', + qq{SELECT test_read_multixact('$multi'::xid);}, + timeout => $PostgreSQL::Test::Utils::timeout_default, + timed_out => \$timed_out); +ok($timed_out == 0, 'recorded multi is readble'); + +# Test mxidwraparound +foreach my $i (1 .. 32) { +$node->safe_psql('postgres',q{SELECT test_create_multixact();}); +} $node->stop; diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c index 6c9b0420717..2fd67273ee5 100644 --- a/src/test/modules/test_slru/test_multixact.c +++ b/src/test/modules/test_slru/test_multixact.c @@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS) MultiXactId id = PG_GETARG_TRANSACTIONID(0); MultiXactMember *members; - INJECTION_POINT("test-multixact-read", NULL); /* discard caches */ AtEOXact_MultiXact(); diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 35413f14019..e810f123f93 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -1191,6 +1191,46 @@ sub start =pod +=item $node->poll_start() => success_or_failure + +Polls start after kill9 + +We may need retries to start a new postmaster. Causes: + - kernel is slow to deliver SIGKILL + - postmaster parent is slow to waitpid() + - postmaster child is slow to exit in response to SIGQUIT + - postmaster child is slow to exit after postmaster death + +=cut + +sub poll_start +{ + my ($self) = @_; + + my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; + my $attempts = 0; + + while ($attempts < $max_attempts) + { + $self->start(fail_ok => 1) && return 1; + + # Wait 0.1 second before retrying. + usleep(100_000); + + # Clean up in case the start attempt just timed out or some such. + $self->stop('fast', fail_ok => 1); + + $attempts++; + } + + # Try one last time without fail_ok, which will BAIL_OUT unless it + # succeeds. + $self->start && return 1; + return 0; +} + +=pod + =item $node->kill9() Send SIGKILL (signal 9) to the postmaster. diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl index c73aa3f0c2c..ac238544217 100644 --- a/src/test/recovery/t/017_shm.pl +++ b/src/test/recovery/t/017_shm.pl @@ -67,7 +67,7 @@ log_ipcs(); # Upon postmaster death, postmaster children exit automatically. $gnat->kill9; log_ipcs(); -poll_start($gnat); # gnat recycles its former shm key. +$gnat->poll_start; # gnat recycles its former shm key. log_ipcs(); note "removing the conflicting shmem ..."; @@ -82,7 +82,7 @@ log_ipcs(); # the higher-keyed segment that the previous postmaster was using. # That's not great, but key collisions should be rare enough to not # make this a big problem. -poll_start($gnat); +$gnat->poll_start; log_ipcs(); $gnat->stop; log_ipcs(); @@ -174,43 +174,11 @@ $slow_client->finish; # client has detected backend termination log_ipcs(); # now startup should work -poll_start($gnat); +$gnat->poll_start; log_ipcs(); # finish testing $gnat->stop; log_ipcs(); - -# We may need retries to start a new postmaster. Causes: -# - kernel is slow to deliver SIGKILL -# - postmaster parent is slow to waitpid() -# - postmaster child is slow to exit in response to SIGQUIT -# - postmaster child is slow to exit after postmaster death -sub poll_start -{ - my ($node) = @_; - - my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; - my $attempts = 0; - - while ($attempts < $max_attempts) - { - $node->start(fail_ok => 1) && return 1; - - # Wait 0.1 second before retrying. - usleep(100_000); - - # Clean up in case the start attempt just timed out or some such. - $node->stop('fast', fail_ok => 1); - - $attempts++; - } - - # Try one last time without fail_ok, which will BAIL_OUT unless it - # succeeds. - $node->start && return 1; - return 0; -} - done_testing(); -- 2.47.3