v10-0001-Avoid-multixact-edge-case-2-by-writing-the-next-.patch

application/octet-stream
Filename: v10-0001-Avoid-multixact-edge-case-2-by-writing-the-next-.patch
Type: application/octet-stream
Part: 0
Message: Re: IPC/MultixactCreation on the Standby server
From d21ba6f71280e03b0e402558626d00ec4b79a156 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 27 Jul 2025 11:37:55 +0500
Subject: [PATCH v10] Avoid multixact edge case 2 by writing the next offset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RecordNewMultiXact now writes the offset of the
following multixact so readers never block waiting for “case 2”.
The new TAP test reproduces IPC/MultixactCreation hangs and verifies
that previously recorded multis stay readable across crash recovery.

Reviewed-by: Dmitry Yurichev <dsy.075@yandex.ru>
Reviewed-by: Álvaro Herrera <alvherre@postgresql.org>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
Reviewed-by: Ivan Bykov <i.bykov@modernsys.ru>
---
 src/backend/access/transam/multixact.c        | 101 ++++++++-----
 src/test/modules/test_slru/t/001_multixact.pl | 141 +++++++++---------
 src/test/modules/test_slru/test_multixact.c   |   1 -
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  40 +++++
 src/test/recovery/t/017_shm.pl                |  38 +----
 5 files changed, 173 insertions(+), 148 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 9d5f130af7e..ddca8433cc1 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -271,12 +271,6 @@ typedef struct MultiXactStateData
 	/* support for members anti-wraparound measures */
 	MultiXactOffset offsetStopLimit;	/* known if oldestOffsetKnown */
 
-	/*
-	 * This is used to sleep until a multixact offset is written when we want
-	 * to create the next one.
-	 */
-	ConditionVariable nextoff_cv;
-
 	/*
 	 * Per-backend data starts here.  We have two arrays stored in the area
 	 * immediately following the MultiXactStateData struct. Each is indexed by
@@ -915,10 +909,20 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	int			i;
 	LWLock	   *lock;
 	LWLock	   *prevlock = NULL;
+	MultiXactId next = multi + 1;
+	int			next_pageno;
 
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
+	/*
+	 * We must also fill next offset to keep current multi readable
+	 * Handle wraparound as GetMultiXactIdMembers() does it.
+	 */
+	if (next < FirstMultiXactId)
+		next = FirstMultiXactId;
+	next_pageno = MultiXactIdToOffsetPage(next);
+
 	lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno);
 	LWLockAcquire(lock, LW_EXCLUSIVE);
 
@@ -933,19 +937,45 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 	offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
 	offptr += entryno;
 
-	*offptr = offset;
+	/*
+	 * We might have filled this offset previosuly.
+	 * Cross-check for correctness.
+	 */
+	Assert((*offptr == 0) || (*offptr == offset));
 
+	*offptr = offset;
 	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 
+	/* Also we records next offset here */
+	if (next_pageno == pageno)
+	{
+		offptr[1] = offset + nmembers;
+	}
+	else
+	{
+		int	next_slotno;
+		MultiXactOffset *next_offptr;
+		int	next_entryno = MultiXactIdToOffsetEntry(next);
+		/* This is an overflow-only branch */
+		Assert(next_entryno == 0 || next == FirstMultiXactId);
+
+		/* Swap the lock for a lock of next page */
+		LWLockRelease(lock);
+		lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno);
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+
+		/* Read and adjust next page */
+		next_slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next);
+		next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[next_slotno];
+		Assert((next_offptr[next_entryno] == 0)
+				|| (next_offptr[next_entryno] == offset + nmembers));
+		next_offptr[next_entryno] = offset + nmembers;
+		MultiXactMemberCtl->shared->page_dirty[next_slotno] = true;
+	}
+
 	/* Release MultiXactOffset SLRU lock. */
 	LWLockRelease(lock);
 
-	/*
-	 * If anybody was waiting to know the offset of this multixact ID we just
-	 * wrote, they can read it now, so wake them up.
-	 */
-	ConditionVariableBroadcast(&MultiXactState->nextoff_cv);
-
 	prev_pageno = -1;
 
 	for (i = 0; i < nmembers; i++, offset++)
@@ -1138,8 +1168,13 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
 			result = FirstMultiXactId;
 	}
 
-	/* Make sure there is room for the MXID in the file.  */
-	ExtendMultiXactOffset(result);
+	/*
+	 * Make sure there is room for the MXID and next offset in the file.
+	 * We might overflow to the next segment, but we don't need to handle
+	 * FirstMultiXactId specifically, because ExtendMultiXactOffset handles
+	 * both cases well: 0 offset and FirstMultiXactId would create segment.
+	 */
+	ExtendMultiXactOffset(MultiXactState->nextMXact + 1);
 
 	/*
 	 * Reserve the members space, similarly to above.  Also, be careful not to
@@ -1304,7 +1339,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	MultiXactOffset nextOffset;
 	MultiXactMember *ptr;
 	LWLock	   *lock;
-	bool		slept = false;
 
 	debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
 
@@ -1383,7 +1417,10 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * 1. This multixact may be the latest one created, in which case there is
 	 * no next one to look at.  In this case the nextOffset value we just
 	 * saved is the correct endpoint.
+	 * TODO: how does it work on Standby?  MultiXactState->nextMXact does not seem to be up-to date.
+	 * nextMXact and nextOffset are in sync, so nothing bad can happen, but nextMXact seems mostly random.
 	 *
+	 * THIS IS NOT POSSIBLE ANYMORE, KEEP IT FOR HISTORIC REASONS.
 	 * 2. The next multixact may still be in process of being filled in: that
 	 * is, another process may have done GetNewMultiXactId but not yet written
 	 * the offset entry for that ID.  In that scenario, it is guaranteed that
@@ -1409,7 +1446,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * cases, so it seems better than holding the MultiXactGenLock for a long
 	 * time on every multixact creation.
 	 */
-retry:
 	pageno = MultiXactIdToOffsetPage(multi);
 	entryno = MultiXactIdToOffsetEntry(multi);
 
@@ -1473,16 +1509,10 @@ retry:
 
 		if (nextMXOffset == 0)
 		{
-			/* Corner case 2: next multixact is still being filled in */
-			LWLockRelease(lock);
-			CHECK_FOR_INTERRUPTS();
-
-			INJECTION_POINT("multixact-get-members-cv-sleep", NULL);
-
-			ConditionVariableSleep(&MultiXactState->nextoff_cv,
-								   WAIT_EVENT_MULTIXACT_CREATION);
-			slept = true;
-			goto retry;
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("MultiXact %d has invalid next offset",
+							multi)));
 		}
 
 		length = nextMXOffset - offset;
@@ -1491,12 +1521,6 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
-	/*
-	 * If we slept above, clean up state; it's no longer needed.
-	 */
-	if (slept)
-		ConditionVariableCancelSleep();
-
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 
 	truelength = 0;
@@ -1986,7 +2010,6 @@ MultiXactShmemInit(void)
 
 		/* Make sure we zero out the per-backend state */
 		MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
-		ConditionVariableInit(&MultiXactState->nextoff_cv);
 	}
 	else
 		Assert(found);
@@ -2136,8 +2159,7 @@ TrimMultiXact(void)
 	 * TrimCLOG() for background.  Unlike CLOG, some WAL record covers every
 	 * pg_multixact SLRU mutation.  Since, also unlike CLOG, we ignore the WAL
 	 * rule "write xlog before data," nextMXact successors may carry obsolete,
-	 * nonzero offset values.  Zero those so case 2 of GetMultiXactIdMembers()
-	 * operates normally.
+	 * nonzero offset values.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
 	if (entryno != 0)
@@ -2487,10 +2509,11 @@ ExtendMultiXactOffset(MultiXactId multi)
 
 	/*
 	 * No work except at first MultiXactId of a page.  But beware: just after
-	 * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
+	 * wraparound, the first MultiXactId of page zero is FirstMultiXactId,
+	 * make sure we are not in that case.
 	 */
-	if (MultiXactIdToOffsetEntry(multi) != 0 &&
-		multi != FirstMultiXactId)
+	Assert(multi != FirstMultiXactId);
+	if (MultiXactIdToOffsetEntry(multi) != 0)
 		return;
 
 	pageno = MultiXactIdToOffsetPage(multi);
diff --git a/src/test/modules/test_slru/t/001_multixact.pl b/src/test/modules/test_slru/t/001_multixact.pl
index e2b567a603d..2f85802f920 100644
--- a/src/test/modules/test_slru/t/001_multixact.pl
+++ b/src/test/modules/test_slru/t/001_multixact.pl
@@ -18,6 +18,10 @@ if ($ENV{enable_injection_points} ne 'yes')
 {
 	plan skip_all => 'Injection points not supported by this build';
 }
+if ($windows_os)
+{
+	plan skip_all => 'Kill9 works unpredicatably on Windows';
+}
 
 my ($node, $result);
 
@@ -25,96 +29,87 @@ $node = PostgreSQL::Test::Cluster->new('mike');
 $node->init;
 $node->append_conf('postgresql.conf',
 	"shared_preload_libraries = 'test_slru,injection_points'");
+# Set the cluster's next multitransaction to 0xFFFFFFF0.
+my $node_pgdata = $node->data_dir;
+command_ok(
+	[
+		'pg_resetwal',
+		'--multixact-ids' => '0xFFFFFFF0,0xFFFFFFF0',
+		$node_pgdata
+	],
+	"set the cluster's next multitransaction to 0xFFFFFFF0");
+command_ok(
+	[
+		'dd',
+		'if=/dev/zero',
+		"of=$node_pgdata/pg_multixact/offsets/FFFF",
+		'bs=4',
+		'count=65536'
+	],
+	"init SLRU file");
+
+command_ok(
+	[
+		'rm',
+		"$node_pgdata/pg_multixact/offsets/0000",
+	],
+	"drop old SLRU file");
+
 $node->start;
 $node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
 $node->safe_psql('postgres', q(CREATE EXTENSION test_slru));
 
-# Test for Multixact generation edge case
-$node->safe_psql('postgres',
-	q{select injection_points_attach('test-multixact-read','wait')});
-$node->safe_psql('postgres',
-	q{select injection_points_attach('multixact-get-members-cv-sleep','wait')}
-);
+# Another multixact test: loosing some multixact must not affect reading near
+# multixacts, even after a crash.
+my $bg_psql = $node->background_psql('postgres');
 
-# This session must observe sleep on the condition variable while generating a
-# multixact.  To achieve this it first will create a multixact, then pause
-# before reading it.
-my $observer = $node->background_psql('postgres');
-
-# This query will create a multixact, and hang just before reading it.
-$observer->query_until(
-	qr/start/,
-	q{
-	\echo start
-	SELECT test_read_multixact(test_create_multixact());
-});
-$node->wait_for_event('client backend', 'test-multixact-read');
-
-# This session will create the next Multixact. This is necessary to avoid
-# multixact.c's non-sleeping edge case 1.
-my $creator = $node->background_psql('postgres');
+my $multi = $bg_psql->query_safe(
+	q(SELECT test_create_multixact();));
+
+# The space for next multi will be allocated, but it will never be actually
+# recorded.
 $node->safe_psql('postgres',
 	q{SELECT injection_points_attach('multixact-create-from-members','wait');}
 );
 
-# We expect this query to hang in the critical section after generating new
-# multixact, but before filling its offset into SLRU.
-# Running an injection point inside a critical section requires it to be
-# loaded beforehand.
-$creator->query_until(
-	qr/start/, q{
-	\echo start
+$bg_psql->query_until(
+	qr/deploying lost multi/, q(
+\echo deploying lost multi
 	SELECT test_create_multixact();
-});
+));
 
 $node->wait_for_event('client backend', 'multixact-create-from-members');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, test-multixact-read',
-	"matching injection point waits");
-
-# Now wake observer to get it to read the initial multixact.  A subsequent
-# multixact already exists, but that one doesn't have an offset assigned, so
-# this will hit multixact.c's edge case 2.
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('test-multixact-read')});
-$node->wait_for_event('client backend', 'multixact-get-members-cv-sleep');
-
-# Ensure we have the backends waiting that we expect
-is( $node->safe_psql(
-		'postgres',
-		q{SELECT string_agg(wait_event, ', ' ORDER BY wait_event)
-		FROM pg_stat_activity WHERE wait_event_type = 'InjectionPoint'}
-	),
-	'multixact-create-from-members, multixact-get-members-cv-sleep',
-	"matching injection point waits");
-
-# Now we have two backends waiting in multixact-create-from-members and
-# multixact-get-members-cv-sleep.  Also we have 3 injections points set to wait.
-# If we wakeup multixact-get-members-cv-sleep it will happen again, so we must
-# detach it first. So let's detach all injection points, then wake up all
-# backends.
-
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('test-multixact-read')});
 $node->safe_psql('postgres',
 	q{SELECT injection_points_detach('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_detach('multixact-get-members-cv-sleep')});
 
 $node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-create-from-members')});
-$node->safe_psql('postgres',
-	q{SELECT injection_points_wakeup('multixact-get-members-cv-sleep')});
+	q{checkpoint;});
 
-# Background psql will now be able to read the result and disconnect.
-$observer->quit;
-$creator->quit;
+# One more multitransaction to effectivelt emit WAL record about next
+# multitransaction (to avaoid corener case 1).
+$node->safe_psql('postgres',
+	q{SELECT test_create_multixact();});
+
+# All set and done, it's time for hard restart
+$node->kill9;
+$node->stop('immediate', fail_ok => 1);
+$node->poll_start;
+$bg_psql->{run}->finish;
+
+# Verify thet recorded multi is readble, this call must not hang.
+# Also note that all injection points disappeared after server restart.
+my $timed_out = 0;
+$node->safe_psql(
+	'postgres',
+	qq{SELECT test_read_multixact('$multi'::xid);},
+	timeout => $PostgreSQL::Test::Utils::timeout_default,
+	timed_out => \$timed_out);
+ok($timed_out == 0, 'recorded multi is readble');
+
+# Test mxidwraparound
+foreach my $i (1 .. 32) {
+$node->safe_psql('postgres',q{SELECT test_create_multixact();});
+}
 
 $node->stop;
 
diff --git a/src/test/modules/test_slru/test_multixact.c b/src/test/modules/test_slru/test_multixact.c
index 6c9b0420717..2fd67273ee5 100644
--- a/src/test/modules/test_slru/test_multixact.c
+++ b/src/test/modules/test_slru/test_multixact.c
@@ -46,7 +46,6 @@ test_read_multixact(PG_FUNCTION_ARGS)
 	MultiXactId id = PG_GETARG_TRANSACTIONID(0);
 	MultiXactMember *members;
 
-	INJECTION_POINT("test-multixact-read", NULL);
 	/* discard caches */
 	AtEOXact_MultiXact();
 
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 35413f14019..e810f123f93 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -1191,6 +1191,46 @@ sub start
 
 =pod
 
+=item $node->poll_start() => success_or_failure
+
+Polls start after kill9
+
+We may need retries to start a new postmaster.  Causes:
+ - kernel is slow to deliver SIGKILL
+ - postmaster parent is slow to waitpid()
+ - postmaster child is slow to exit in response to SIGQUIT
+ - postmaster child is slow to exit after postmaster death
+
+=cut
+
+sub poll_start
+{
+	my ($self) = @_;
+
+	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+	my $attempts = 0;
+
+	while ($attempts < $max_attempts)
+	{
+		$self->start(fail_ok => 1) && return 1;
+
+		# Wait 0.1 second before retrying.
+		usleep(100_000);
+
+		# Clean up in case the start attempt just timed out or some such.
+		$self->stop('fast', fail_ok => 1);
+
+		$attempts++;
+	}
+
+	# Try one last time without fail_ok, which will BAIL_OUT unless it
+	# succeeds.
+	$self->start && return 1;
+	return 0;
+}
+
+=pod
+
 =item $node->kill9()
 
 Send SIGKILL (signal 9) to the postmaster.
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
index c73aa3f0c2c..ac238544217 100644
--- a/src/test/recovery/t/017_shm.pl
+++ b/src/test/recovery/t/017_shm.pl
@@ -67,7 +67,7 @@ log_ipcs();
 # Upon postmaster death, postmaster children exit automatically.
 $gnat->kill9;
 log_ipcs();
-poll_start($gnat);    # gnat recycles its former shm key.
+$gnat->poll_start;    # gnat recycles its former shm key.
 log_ipcs();
 
 note "removing the conflicting shmem ...";
@@ -82,7 +82,7 @@ log_ipcs();
 # the higher-keyed segment that the previous postmaster was using.
 # That's not great, but key collisions should be rare enough to not
 # make this a big problem.
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 $gnat->stop;
 log_ipcs();
@@ -174,43 +174,11 @@ $slow_client->finish;    # client has detected backend termination
 log_ipcs();
 
 # now startup should work
-poll_start($gnat);
+$gnat->poll_start;
 log_ipcs();
 
 # finish testing
 $gnat->stop;
 log_ipcs();
 
-
-# We may need retries to start a new postmaster.  Causes:
-# - kernel is slow to deliver SIGKILL
-# - postmaster parent is slow to waitpid()
-# - postmaster child is slow to exit in response to SIGQUIT
-# - postmaster child is slow to exit after postmaster death
-sub poll_start
-{
-	my ($node) = @_;
-
-	my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
-	my $attempts = 0;
-
-	while ($attempts < $max_attempts)
-	{
-		$node->start(fail_ok => 1) && return 1;
-
-		# Wait 0.1 second before retrying.
-		usleep(100_000);
-
-		# Clean up in case the start attempt just timed out or some such.
-		$node->stop('fast', fail_ok => 1);
-
-		$attempts++;
-	}
-
-	# Try one last time without fail_ok, which will BAIL_OUT unless it
-	# succeeds.
-	$node->start && return 1;
-	return 0;
-}
-
 done_testing();
-- 
2.51.2