v10-0001-Avoid-multixact-edge-case-2-by-writing-the-next-.patch
application/octet-stream
Filename: v10-0001-Avoid-multixact-edge-case-2-by-writing-the-next-.patch
Type: application/octet-stream
Part: 0
From d21ba6f71280e03b0e402558626d00ec4b79a156 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 27 Jul 2025 11:37:55 +0500
Subject: [PATCH v10] Avoid multixact edge case 2 by writing the next offset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
RecordNewMultiXact now writes the offset of the
following multixact so readers never block waiting for “case 2”.
The new TAP test reproduces IPC/MultixactCreation hangs and verifies
that previously recorded multis stay readable across crash recovery.
Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru>
Reviewed-by: Álvaro Herrera <alvherre@postgresql.org>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru>
---
src/backend/access/transam/multixact.c | 101 ++++++++-----
src/test/modules/test_slru/t/001_multixact.pl | 141 +++++++++---------
src/test/modules/test_slru/test_multixact.c | 1 -
src/test/perl/PostgreSQL/Test/Cluster.pm | 40 +++++
src/test/recovery/t/017_shm.pl | 38 +----
5 files changed, 173 insertions(+), 148 deletions(-)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 9d5f130af7e..ddca8433cc1 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -271,12 +271,6 @@ typedef struct MultiXactStateData
/* support for members anti-wraparound measures */
MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */
- /*
- * This is used to sleep until a multixact offset is written when we want
- * to create the next one.
- */
- ConditionVariable nextoff_cv;
-
/*
* Per-backend data starts here. We have two arrays stored in the area
* immediately following the MultiXactStateData struct. Each is indexed by
@@ -915,10 +909,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
int i;
LWLock *lock;
LWLock *prevlock = NULL;
+ MultiXactId next = multi + 1;
+ int next_pageno;
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);
+ /*
+ * We must also fill next offset to keep current multi readable
+ * Handle wraparound as GetMultiXactIdMembers() does it.
+ */
+ if (next < FirstMultiXactId)
+ next = FirstMultiXactId;
+ next_pageno = MultiXactIdToOffsetPage(next);
+
lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
LWLockAcquire(lock, LW_EXCLUSIVE);
@@ -933,19 +937,45 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
offptr += entryno;
- *offptr = offset;
+ /*
+ * We might have filled this offset previosuly.
+ * Cross-check for correctness.
+ */
+ Assert((*offptr == 0) || (*offptr == offset));
+ *offptr = offset;
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+ /* Also we records next offset here */
+ if (next_pageno == pageno)
+ {
+ offptr[1] = offset + nmembers;
+ }
+ else
+ {
+ int next_slotno;
+ MultiXactOffset *next_offptr;
+ int next_entryno = MultiXactIdToOffsetEntry(next);
+ /* This is an overflow-only branch */
+ Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+ /* Swap the lock for a lock of next page */
+ LWLockRelease(lock);
+ lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+
+ /* Read and adjust next page */
+ next_slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+ next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[next_slotno];
+ Assert((next_offptr[next_entryno] == 0)
+ || (next_offptr[next_entryno] == offset + nmembers));
+ next_offptr[next_entryno] = offset + nmembers;
+ MultiXactMemberCtl->shared->page_dirty[next_slotno] = true;
+ }
+
/* Release MultiXactOffset SLRU lock. */
LWLockRelease(lock);
- /*
- * If anybody was waiting to know the offset of this multixact ID we just
- * wrote, they can read it now, so wake them up.
- */
- ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
prev_pageno = -1;
for (i = 0; i < nmembers; i++, offset++)
@@ -1138,8 +1168,13 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
result = FirstMultiXactId;
}
- /* Make sure there is room for the MXID in the file. */
- ExtendMultiXactOffset(result);
+ /*
+ * Make sure there is room for the MXID and next offset in the file.
+ * We might overflow to the next segment, but we don't need to handle
+ * FirstMultiXactId specifically, because ExtendMultiXactOffset handles
+ * both cases well: 0 offset and FirstMultiXactId would create segment.
+ */
+ ExtendMultiXactOffset(MultiXactState->nextMXact + 1);
/*
* Reserve the members space, similarly to above. Also, be careful not to
@@ -1304,7 +1339,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
MultiXactOffset nextOffset;
MultiXactMember *ptr;
LWLock *lock;
- bool slept = false;
debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
@@ -1383,7 +1417,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* 1. This multixact may be the latest one created, in which case there is
* no next one to look at. In this case the nextOffset value we just
* saved is the correct endpoint.
+ * TODO: how does it work on Standby? MultiXactState->nextMXact does not seem to be up-to date.
+ * nextMXact and nextOffset are in sync, so nothing bad can happen, but nextMXact seems mostly random.
*
+ * THIS IS NOT POSSIBLE ANYMORE, KEEP IT FOR HISTORIC REASONS.
* 2. The next multixact may still be in process of being filled in: that
* is, another process may have done GetNewMultiXactId but not yet written
* the offset entry for that ID. In that scenario, it is guaranteed that
@@ -1409,7 +1446,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
* cases, so it seems better than holding the MultiXactGenLock for a long
* time on every multixact creation.
*/
-retry:
pageno = MultiXactIdToOffsetPage(multi);
entryno = MultiXactIdToOffsetEntry(multi);
@@ -1473,16 +1509,10 @@ retry:
if (nextMXOffset == 0)
{
- /* Corner case 2: next multixact is still being filled in */
- LWLockRelease(lock);
- CHECK_FOR_INTERRUPTS();
-
- INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
- ConditionVariableSleep(&MultiXactState->nextoff_cv,
- WAIT_EVENT_MULTIXACT_CREATION);
- slept = true;
- goto retry;
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("MultiXact %d has invalid next offset",
+ multi)));
}
length = nextMXOffset - offset;
@@ -1491,12 +1521,6 @@ retry:
LWLockRelease(lock);
lock = NULL;
- /*
- * If we slept above, clean up state; it's no longer needed.
- */
- if (slept)
- ConditionVariableCancelSleep();
-
ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
truelength = 0;
@@ -1986,7 +2010,6 @@ MultiXactShmemInit(void)
/* Make sure we zero out the per-backend state */
MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
- ConditionVariableInit(&MultiXactState->nextoff_cv);
}
else
Assert(found);
@@ -2136,8 +2159,7 @@ TrimMultiXact(void)
* TrimCLOG() for background. Unlike CLOG, some WAL record covers every
* pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
* rule "write xlog before data," nextMXact successors may carry obsolete,
- * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
- * operates normally.
+ * nonzero offset values.
*/
entryno = MultiXactIdToOffsetEntry(nextMXact);
if (entryno != 0)
@@ -2487,10 +2509,11 @@ ExtendMultiXactOffset(MultiXactId multi)
/*
* No work except at first MultiXactId of a page. But beware: just after
- * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
+ * wraparound, the first MultiXactId of page zero is FirstMultiXactId,
+ * make sure we are not in that case.
*/
- if (MultiXactIdToOffsetEntry(multi) != 0 &&
- multi != FirstMultiXactId)
+ Assert(multi != FirstMultiXactId);
+ if (MultiXactIdToOffsetEntry(multi) != 0)
return;
pageno = MultiXactIdToOffsetPage(multi);
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..2f85802f920 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes')
{
plan skip_all => 'Injection points not supported by this build';
}
+if ($windows_os)
+{
+ plan skip_all => 'Kill9 works unpredicatably on Windows';
+}
my ($node, $result);
@@ -25,96 +29,87 @@ $node = PostgreSQL::Test::Cluster->new('mike');
$node->init;
$node->append_conf('postgresql.conf',
"shared_preload_libraries = 'test_slru,injection_points'");
+# Set the cluster's next multitransaction to 0xFFFFFFF0.
+my $node_pgdata = $node->data_dir;
+command_ok(
+ [
+ 'pg_resetwal',
+ '--multixact-ids' => '0xFFFFFFF0,0xFFFFFFF0',
+ $node_pgdata
+ ],
+ "set the cluster's next multitransaction to 0xFFFFFFF0");
+command_ok(
+ [
+ 'dd',
+ 'if=/dev/zero',
+ "of=$node_pgdata/pg_multixact/offsets/FFFF",
+ 'bs=4',
+ 'count=65536'
+ ],
+ "init SLRU file");
+
+command_ok(
+ [
+ 'rm',
+ "$node_pgdata/pg_multixact/offsets/0000",
+ ],
+ "drop old SLRU file");
+
$node->start;
$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
$node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
- q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
- q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Another multixact test: loosing some multixact must not affect reading near
+# multixacts, even after a crash.
+my $bg_psql = $node->background_psql('postgres');
-# This session must observe sleep on the condition variable while generating a
-# multixact. To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
- qr/start/,
- q{
- \echo start
- SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $multi = $bg_psql->query_safe(
+ q(SELECT test_create_multixact();));
+
+# The space for next multi will be allocated, but it will never be actually
+# recorded.
$node->safe_psql('postgres',
q{SELECT injection_points_attach('multixact-create-from-members','wait');}
);
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
- qr/start/, q{
- \echo start
+$bg_psql->query_until(
+ qr/deploying lost multi/, q(
+\echo deploying lost multi
SELECT test_create_multixact();
-});
+));
$node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
- 'postgres',
- q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
- FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
- ),
- 'multixact-create-from-members, test-multixact-read',
- "matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact. A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
- q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
- 'postgres',
- q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
- FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
- ),
- 'multixact-create-from-members, multixact-get-members-cv-sleep',
- "matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep. Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
- q{SELECT injection_points_detach('test-multixact-read')});
$node->safe_psql('postgres',
q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
- q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
$node->safe_psql('postgres',
- q{SELECT injection_points_wakeup('multixact-create-from-members')});
-$node->safe_psql('postgres',
- q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+ q{checkpoint;});
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# One more multitransaction to effectivelt emit WAL record about next
+# multitransaction (to avaoid corener case 1).
+$node->safe_psql('postgres',
+ q{SELECT test_create_multixact();});
+
+# All set and done, it's time for hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
+
+# Verify thet recorded multi is readble, this call must not hang.
+# Also note that all injection points disappeared after server restart.
+my $timed_out = 0;
+$node->safe_psql(
+ 'postgres',
+ qq{SELECT test_read_multixact('$multi'::xid);},
+ timeout => $PostgreSQL::Test::Utils::timeout_default,
+ timed_out => \$timed_out);
+ok($timed_out == 0, 'recorded multi is readble');
+
+# Test mxidwraparound
+foreach my $i (1 .. 32) {
+$node->safe_psql('postgres',q{SELECT test_create_multixact();});
+}
$node->stop;
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c
index 6c9b0420717..2fd67273ee5 100644
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
MultiXactId id = PG_GETARG_TRANSACTIONID(0);
MultiXactMember *members;
- INJECTION_POINT("test-multixact-read", NULL);
/* discard caches */
AtEOXact_MultiXact();
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..e810f123f93 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1191,6 +1191,46 @@ sub start
=pod
+=item $node->poll_start() => success_or_failure
+
+Polls start after kill9
+
+We may need retries to start a new postmaster. Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+ my ($self) = @_;
+
+ my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+ my $attempts = 0;
+
+ while ($attempts < $max_attempts)
+ {
+ $self->start(fail_ok => 1) && return 1;
+
+ # Wait 0.1 second before retrying.
+ usleep(100_000);
+
+ # Clean up in case the start attempt just timed out or some such.
+ $self->stop('fast', fail_ok => 1);
+
+ $attempts++;
+ }
+
+ # Try one last time without fail_ok, which will BAIL_OUT unless it
+ # succeeds.
+ $self->start && return 1;
+ return 0;
+}
+
+=pod
+
=item $node->kill9()
Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index c73aa3f0c2c..ac238544217 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -67,7 +67,7 @@ log_ipcs();
# Upon postmaster death, postmaster children exit automatically.
$gnat->kill9;
log_ipcs();
-poll_start($gnat); # gnat recycles its former shm key.
+$gnat->poll_start; # gnat recycles its former shm key.
log_ipcs();
note "removing the conflicting shmem ...";
@@ -82,7 +82,7 @@ log_ipcs();
# the higher-keyed segment that the previous postmaster was using.
# That's not great, but key collisions should be rare enough to not
# make this a big problem.
-poll_start($gnat);
+$gnat->poll_start;
log_ipcs();
$gnat->stop;
log_ipcs();
@@ -174,43 +174,11 @@ $slow_client->finish; # client has detected backend termination
log_ipcs();
# now startup should work
-poll_start($gnat);
+$gnat->poll_start;
log_ipcs();
# finish testing
$gnat->stop;
log_ipcs();
-
-# We may need retries to start a new postmaster. Causes:
-# - kernel is slow to deliver SIGKILL
-# - postmaster parent is slow to waitpid()
-# - postmaster child is slow to exit in response to SIGQUIT
-# - postmaster child is slow to exit after postmaster death
-sub poll_start
-{
- my ($node) = @_;
-
- my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
- my $attempts = 0;
-
- while ($attempts < $max_attempts)
- {
- $node->start(fail_ok => 1) && return 1;
-
- # Wait 0.1 second before retrying.
- usleep(100_000);
-
- # Clean up in case the start attempt just timed out or some such.
- $node->stop('fast', fail_ok => 1);
-
- $attempts++;
- }
-
- # Try one last time without fail_ok, which will BAIL_OUT unless it
- # succeeds.
- $node->start && return 1;
- return 0;
-}
-
done_testing();
--
2.51.2