tap-fixes.txt

text/plain

Filename: tap-fixes.txt
Type: text/plain
Part: 1
Message: Re: Changing the state of data checksums in a running cluster
From 57bb79b1bc8faac646131336abcc1596711c5f32 Mon Sep 17 00:00:00 2001
From: tomas <tomas>
Date: Thu, 28 Aug 2025 22:10:25 +0200
Subject: [PATCH] TAP fixes

---
 .../t/006_concurrent_pgbench.pl               | 88 ++++++++++++++-----
 1 file changed, 68 insertions(+), 20 deletions(-)

diff --git a/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl
index b33ca6e0c26..374eac7e6a3 100644
--- a/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl
+++ b/src/test/modules/test_checksums/t/006_concurrent_pgbench.pl
@@ -23,11 +23,14 @@ my $node_primary_loglocation = 0;
 my $node_standby_1;
 my $node_standby_1_loglocation = 0;
 
+my $pgbench_primary = undef;
+my $pgbench_standby = undef;
+
 # The number of full test iterations which will be performed. The exact number
 # of tests performed and the wall time taken is non-deterministic as the test
 # performs a lot of randomized actions, but 50 iterations will be a long test
 # run regardless.
-my $TEST_ITERATIONS = 50;
+my $TEST_ITERATIONS = 1000;
 
 # Variables which record the current state of the cluster
 my $data_checksum_state = 'off';
@@ -55,7 +58,7 @@ if ($ENV{enable_injection_points} ne 'yes')
 # whether to turn things off during testing.
 sub cointoss
 {
-	return int(rand(2) == 1);
+	return int(rand() < 0.5);
 }
 
 # Helper for injecting random sleeps here and there in the testrun. The sleep
@@ -73,11 +76,16 @@ sub background_ro_pgbench
 {
 	my ($port, $stdin, $stdout, $stderr) = @_;
 
-	my $pgbench_primary = IPC::Run::start(
-		[ 'pgbench', '-p', $port, '-S', '-T', '600', '-c', '10', 'postgres' ],
-		'<' => \$stdin,
-		'>' => \$stdout,
-		'2>' => \$stderr,
+	if ($pgbench_standby)
+	{
+		$pgbench_standby->finish;
+	}
+
+	$pgbench_standby = IPC::Run::start(
+		[ 'pgbench', '-n', '-p', $port, '-S', '-T', '600', '-c', '10', 'postgres' ],
+		'<' => '/dev/null',
+		'>' => '/dev/null',
+		'2>' => '/dev/null',
 		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
 }
 
@@ -87,11 +95,16 @@ sub background_rw_pgbench
 {
 	my ($port, $stdin, $stdout, $stderr) = @_;
 
-	my $pgbench_primary = IPC::Run::start(
+	if ($pgbench_primary)
+	{
+		$pgbench_primary->finish;
+	}
+
+	$pgbench_primary = IPC::Run::start(
 		[ 'pgbench', '-p', $port, '-T', '600', '-c', '10', 'postgres' ],
-		'<' => \$stdin,
-		'>' => \$stdout,
-		'2>' => \$stderr,
+		'<' => '/dev/null',
+		'>' => '/dev/null',
+		'2>' => '/dev/null',
 		IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default));
 }
 
@@ -224,6 +237,9 @@ background_rw_pgbench(
 	$node_primary->port, $pgb_primary_stdin,
 	$pgb_primary_stdout, $pgb_primary_stderr);
 
+my $primary_shutdown_clean = 0;
+my $standby_shutdown_clean = 0;
+
 # Main test suite. This loop will start a pgbench run on the cluster and while
 # that's running flip the state of data checksums concurrently. It will then
 # randomly restart thec cluster (in fast or immediate) mode and then check for
@@ -246,9 +262,11 @@ for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
 		$node_primary_loglocation = -s $node_primary->logfile;
 
 		# If data checksums are enabled, take the opportunity to verify them
-		# while the cluster is offline
-		$node_primary->checksum_verify_offline()
-		  unless $data_checksum_state eq 'off';
+		# while the cluster is offline (but only after a clean shutdown, not
+		# an immediate one). XXX the check below is currently commented out.
+		#$node_primary->checksum_verify_offline()
+		#  unless $data_checksum_state eq 'off' or !$primary_shutdown_clean;
+
 		random_sleep();
 		$node_primary->start;
 		# Start a pgbench in the background against the primary
@@ -270,9 +288,11 @@ for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
 		$node_standby_1_loglocation = -s $node_standby_1->logfile;
 
 		# If data checksums are enabled, take the opportunity to verify them
-		# while the cluster is offline
-		$node_standby_1->checksum_verify_offline()
-		  unless $data_checksum_state eq 'off';
+		# while the cluster is offline (but only after a clean shutdown, not
+		# an immediate one). XXX the check below is currently commented out.
+		#$node_standby_1->checksum_verify_offline()
+		#  unless $data_checksum_state eq 'off' or !$standby_shutdown_clean;
+
 		random_sleep();
 		$node_standby_1->start;
 		# Start a select-only pgbench in the background on the standby
@@ -287,13 +307,41 @@ for (my $i = 0; $i < $TEST_ITERATIONS; $i++)
 	my $result = $node_primary->safe_psql('postgres',
 		"SELECT count(*) FROM t WHERE a > 1");
 	is($result, '100000', 'ensure data pages can be read back on primary');
+
 	random_sleep();
+
 	$node_primary->wait_for_catchup($node_standby_1, 'write');
 
-	# Potentially powercycle the cluster
-	$node_primary->stop($stop_modes[ int(rand(100)) ]) if cointoss();
 	random_sleep();
-	$node_standby_1->stop($stop_modes[ int(rand(100)) ]) if cointoss();
+
+	# Potentially powercycle the cluster (each node independently).
+	# XXX maybe also try stopping the nodes in the opposite order?
+	if (cointoss())
+	{
+		my $mode = $stop_modes[ int(rand(100)) ];
+		$node_primary->stop($mode);
+		$primary_shutdown_clean = ($mode eq 'fast');
+	}
+
+	random_sleep();
+
+	if (cointoss())
+	{
+		my $mode = $stop_modes[ int(rand(100)) ];
+		$node_standby_1->stop($mode);
+		$standby_shutdown_clean = ($mode eq 'fast');
+	}
+}
+
+# Make sure both nodes are running again before the final checks.
+if (!$node_primary->is_alive)
+{
+	$node_primary->start;
+}
+
+if (!$node_standby_1->is_alive)
+{
+        $node_standby_1->start;
 }
 
 # Testrun is over, ensure that data reads back as expected and perform a final
-- 
2.39.5