From f295b9b0cd6f5597b88b5c302b2ab434262f5e0e Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Sun, 27 Jul 2025 11:37:55 +0500 Subject: [PATCH v9] Avoid edge case 2 in multixacts --- src/backend/access/transam/multixact.c | 85 +++++++++------ src/test/modules/test_slru/t/001_multixact.pl | 103 ++++++------------ src/test/modules/test_slru/test_multixact.c | 1 - src/test/perl/PostgreSQL/Test/Cluster.pm | 40 +++++++ src/test/recovery/t/017_shm.pl | 38 +------ 5 files changed, 123 insertions(+), 144 deletions(-) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 3cb09c3d598..8e530558c2f 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -275,12 +275,6 @@ typedef struct MultiXactStateData /* support for members anti-wraparound measures */ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. - */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -921,10 +915,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int i; LWLock *lock; LWLock *prevlock = NULL; + MultiXactId next = multi + 1; + int next_pageno; pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* + * We must also fill next offset to keep current multi readable + * Handle wraparound as GetMultiXactIdMembers() does it. 
+	 */
+	if (next < FirstMultiXactId)
+		next = FirstMultiXactId;
+	next_pageno = MultiXactIdToOffsetPage(next);
+
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
 
@@ -939,19 +943,41 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	/*
+	 * We might have filled this offset previously.
+	 * Cross-check for correctness.
+	 */
+	Assert((*offptr == 0) || (*offptr == offset));
+	*offptr = offset;
 
 	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 
+	if (next_pageno == pageno)
+	{
+		offptr[1] = offset + nmembers;
+	}
+	else
+	{
+		int			next_slotno;
+		MultiXactOffset *next_offptr;
+		int			next_entryno = MultiXactIdToOffsetEntry(next);
+
+		Assert(next_entryno == 0);	/* This is an overflow-only branch */
+
+		/* Swap the lock for a lock of next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		/* Read and adjust next page */
+		next_slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[next_slotno];
+		next_offptr[next_entryno] = offset + nmembers;
+		MultiXactOffsetCtl->shared->page_dirty[next_slotno] = true;
+	}
+
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
-	 */
-	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
 	prev_pageno = -1;
 
 	for (i = 0; i < nmembers; i++, offset++)
@@ -1144,8 +1170,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 		result = FirstMultiXactId;
 	}
 
-	/* Make sure there is room for the MXID in the file. */
-	ExtendMultiXactOffset(result);
+	/* Make sure there is room for the MXID and next offset in the file.
*/ + ExtendMultiXactOffset(MultiXactState->nextMXact + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1310,7 +1336,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1389,7 +1414,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * 1. This multixact may be the latest one created, in which case there is * no next one to look at. In this case the nextOffset value we just * saved is the correct endpoint. + * TODO: how does it work on Standby? MultiXactState->nextMXact does not seem to be up-to date. + * nextMXact and nextOffset are in sync, so nothing bad can happen, but nextMXact seems mostly random. * + * THIS IS NOT POSSIBLE ANYMORE, KEEP IT FOR HISTORIC REASONS. * 2. The next multixact may still be in process of being filled in: that * is, another process may have done GetNewMultiXactId but not yet written * the offset entry for that ID. In that scenario, it is guaranteed that @@ -1415,7 +1443,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. 
 	 */
-retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
@@ -1479,16 +1506,10 @@ retry:
 
 	if (nextMXOffset == 0)
 	{
-		/* Corner case 2: next multixact is still being filled in */
-		LWLockRelease(lock);
-		CHECK_FOR_INTERRUPTS();
-
-		INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-		ConditionVariableSleep(&MultiXactState->nextoff_cv,
-							   WAIT_EVENT_MULTIXACT_CREATION);
-		slept = true;
-		goto retry;
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("MultiXact %u has invalid next offset",
+						multi)));
 	}
 
 	length = nextMXOffset - offset;
@@ -1497,12 +1518,6 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
-	/*
-	 * If we slept above, clean up state; it's no longer needed.
-	 */
-	if (slept)
-		ConditionVariableCancelSleep();
-
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 	truelength = 0;
 
@@ -1992,7 +2007,6 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
@@ -2142,8 +2156,7 @@ TrimMultiXact(void)
 	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
 	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
 	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * nonzero offset values.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
 	if (entryno != 0)
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..752a72dcc23 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes')
 {
 	plan skip_all => 'Injection points not supported by this build';
 }
+if ($windows_os)
+{
+	plan skip_all => 'Kill9 works unpredictably on Windows';
+}
 
 my ($node, $result);
 
@@ -29,92 +33,47 @@ $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Another multixact test: losing some multixact must not affect reading near
+# multixacts, even after a crash.
+my $bg_psql = $node->background_psql('postgres');
+
+my $multi = $bg_psql->query_safe(
+	q(SELECT test_create_multixact();));
 
-# This session must observe sleep on the condition variable while generating a
-# multixact. To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+# The space for next multi will be allocated, but it will never be actually
+# recorded.
$node->safe_psql('postgres', q{SELECT injection_points_attach('multixact-create-from-members','wait');} ); -# We expect this query to hang in the critical section after generating new -# multixact, but before filling its offset into SLRU. -# Running an injection point inside a critical section requires it to be -# loaded beforehand. -$creator->query_until( - qr/start/, q{ - \echo start +$bg_psql->query_until( + qr/deploying lost multi/, q( +\echo deploying lost multi SELECT test_create_multixact(); -}); +)); $node->wait_for_event('client backend', 'multixact-create-from-members'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, test-multixact-read', - "matching injection point waits"); - -# Now wake observer to get it to read the initial multixact. A subsequent -# multixact already exists, but that one doesn't have an offset assigned, so -# this will hit multixact.c's edge case 2. -$node->safe_psql('postgres', - q{SELECT injection_points_wakeup('test-multixact-read')}); -$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep'); - -# Ensure we have the backends waiting that we expect -is( $node->safe_psql( - 'postgres', - q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event) - FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'} - ), - 'multixact-create-from-members, multixact-get-members-cv-sleep', - "matching injection point waits"); - -# Now we have two backends waiting in multixact-create-from-members and -# multixact-get-members-cv-sleep. Also we have 3 injections points set to wait. -# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must -# detach it first. So let's detach all injection points, then wake up all -# backends. 
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
 $node->safe_psql('postgres',
 	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
 
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
+	q{checkpoint;});
+
+# One more multitransaction to effectively emit WAL record about next
+# multitransaction (to avoid corner case 1).
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+	q{SELECT test_create_multixact();});
+
+# All set and done, it's time for a hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
 
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# Verify that recorded multi is readable, this call must not hang.
+# Also note that all injection points disappeared after server restart.
+$node->safe_psql('postgres', + qq{SELECT test_read_multixact('$multi'::xid);}); $node->stop; diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c index 6c9b0420717..2fd67273ee5 100644 --- a/src/test/modules/test_slru/test_multixact.c +++ b/src/test/modules/test_slru/test_multixact.c @@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS) MultiXactId id = PG_GETARG_TRANSACTIONID(0); MultiXactMember *members; - INJECTION_POINT("test-multixact-read", NULL); /* discard caches */ AtEOXact_MultiXact(); diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm index 61f68e0cc2e..a162baa55c6 100644 --- a/src/test/perl/PostgreSQL/Test/Cluster.pm +++ b/src/test/perl/PostgreSQL/Test/Cluster.pm @@ -1190,6 +1190,46 @@ sub start =pod +=item $node->poll_start() => success_or_failure + +Polls start after kill9 + +We may need retries to start a new postmaster. Causes: + - kernel is slow to deliver SIGKILL + - postmaster parent is slow to waitpid() + - postmaster child is slow to exit in response to SIGQUIT + - postmaster child is slow to exit after postmaster death + +=cut + +sub poll_start +{ + my ($self) = @_; + + my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; + my $attempts = 0; + + while ($attempts < $max_attempts) + { + $self->start(fail_ok => 1) && return 1; + + # Wait 0.1 second before retrying. + usleep(100_000); + + # Clean up in case the start attempt just timed out or some such. + $self->stop('fast', fail_ok => 1); + + $attempts++; + } + + # Try one last time without fail_ok, which will BAIL_OUT unless it + # succeeds. + $self->start && return 1; + return 0; +} + +=pod + =item $node->kill9() Send SIGKILL (signal 9) to the postmaster. 
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl index c73aa3f0c2c..ac238544217 100644 --- a/src/test/recovery/t/017_shm.pl +++ b/src/test/recovery/t/017_shm.pl @@ -67,7 +67,7 @@ log_ipcs(); # Upon postmaster death, postmaster children exit automatically. $gnat->kill9; log_ipcs(); -poll_start($gnat); # gnat recycles its former shm key. +$gnat->poll_start; # gnat recycles its former shm key. log_ipcs(); note "removing the conflicting shmem ..."; @@ -82,7 +82,7 @@ log_ipcs(); # the higher-keyed segment that the previous postmaster was using. # That's not great, but key collisions should be rare enough to not # make this a big problem. -poll_start($gnat); +$gnat->poll_start; log_ipcs(); $gnat->stop; log_ipcs(); @@ -174,43 +174,11 @@ $slow_client->finish; # client has detected backend termination log_ipcs(); # now startup should work -poll_start($gnat); +$gnat->poll_start; log_ipcs(); # finish testing $gnat->stop; log_ipcs(); - -# We may need retries to start a new postmaster. Causes: -# - kernel is slow to deliver SIGKILL -# - postmaster parent is slow to waitpid() -# - postmaster child is slow to exit in response to SIGQUIT -# - postmaster child is slow to exit after postmaster death -sub poll_start -{ - my ($node) = @_; - - my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; - my $attempts = 0; - - while ($attempts < $max_attempts) - { - $node->start(fail_ok => 1) && return 1; - - # Wait 0.1 second before retrying. - usleep(100_000); - - # Clean up in case the start attempt just timed out or some such. - $node->stop('fast', fail_ok => 1); - - $attempts++; - } - - # Try one last time without fail_ok, which will BAIL_OUT unless it - # succeeds. - $node->start && return 1; - return 0; -} - done_testing(); -- 2.39.5 (Apple Git-154)