v1-pg14.4-0001-Avoid-copying-WAL-segments-before-divergence-to-spee.patch

application/octet-stream

Filename: v1-pg14.4-0001-Avoid-copying-WAL-segments-before-divergence-to-spee.patch
Type: application/octet-stream
Part: 0
Message: Re: Making pg_rewind faster

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch returns the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v1-0001
Subject: Avoid copying WAL segments before divergence to speed up pg_rewind
File+
src/bin/pg_rewind/filemap.c 20 5
src/bin/pg_rewind/pg_rewind.c 37 2
src/bin/pg_rewind/pg_rewind.h 2 0
src/bin/pg_rewind/t/001_basic.pl 64 1
src/bin/pg_rewind/t/008_min_recovery_point.pl 73 1
From 745fc4bde84cae7e595a65a0e23c8476b0e3e323 Mon Sep 17 00:00:00 2001
From: Justin Kwan <jkwan@cloudflare.com>
Date: Fri, 15 Jul 2022 18:48:59 -0700
Subject: [PATCH] Avoid copying WAL segments before divergence to speed up
 pg_rewind

"Optimize pg_rewind to Skip Copying Common WAL Files". It adds a conditional
check to avoid unnecessarily copying any WAL segment files from source to
target if they are common between both servers, before the point of WAL
divergence during pg_rewind. On the source server, each WAL file's
corresponding segment number is computed and compared against the segment
number of the first diverged LSN. All WAL files which fall before the segment
of the first diverged LSN can safely be skipped from copying to the target.

The reduction in WAL segment files transmitted over the network from source to
target server massively reduces overall pg_rewind execution time, when a large
amount of WAL segment files are retained.

This patch is intended for immediate application into the Postgres master
branch on version 14.4.

Regression tests are included to verify that WAL segment files prior to the
point of WAL divergence are not copied. The source code compiles successfully
and all tests pass.

Author: Justin Kwan, Vignesh Ravichandran
---
 src/bin/pg_rewind/filemap.c                   | 25 +++++--
 src/bin/pg_rewind/pg_rewind.c                 | 39 +++++++++-
 src/bin/pg_rewind/pg_rewind.h                 |  2 +
 src/bin/pg_rewind/t/001_basic.pl              | 65 +++++++++++++++-
 src/bin/pg_rewind/t/008_min_recovery_point.pl | 74 ++++++++++++++++++-
 5 files changed, 196 insertions(+), 9 deletions(-)

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 2618b4c957..0df84bc115 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -26,6 +26,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/xlog_internal.h"
 #include "catalog/pg_tablespace_d.h"
 #include "common/hashfn.h"
 #include "common/string.h"
@@ -729,11 +730,25 @@ decide_file_action(file_entry_t *entry)
 		case FILE_TYPE_REGULAR:
 			if (!entry->isrelfile)
 			{
-				/*
-				 * It's a non-data file that we have no special processing
-				 * for. Copy it in toto.
-				 */
-				return FILE_ACTION_COPY;
+                /* Handle WAL segment file. */
+                const char  *fname;
+                char        *slash;
+
+                /* Split filepath into directory & filename. */
+                slash = strrchr(path, '/');
+                if (slash)
+                    fname = slash + 1;
+                else
+                    fname = path;
+
+                if (IsXLogFileName(fname))
+                    return decide_wal_file_action(fname);
+
+                /*
+                 * It's a non-data file that we have no special processing
+                 * for. Copy it in toto.
+                 */
+                return FILE_ACTION_COPY;
 			}
 			else
 			{
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 2ac4910778..591f6674d7 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -24,7 +24,6 @@
 #include "common/string.h"
 #include "fe_utils/recovery_gen.h"
 #include "file_ops.h"
-#include "filemap.h"
 #include "getopt_long.h"
 #include "pg_rewind.h"
 #include "rewind_source.h"
@@ -67,6 +66,8 @@ bool		dry_run = false;
 bool		do_sync = true;
 bool		restore_wal = false;
 
+static XLogRecPtr   divergerec;
+
 /* Target history */
 TimeLineHistoryEntry *targetHistory;
 int			targetNentries;
@@ -124,7 +125,6 @@ main(int argc, char **argv)
 	};
 	int			option_index;
 	int			c;
-	XLogRecPtr	divergerec;
 	int			lastcommontliIndex;
 	XLogRecPtr	chkptrec;
 	TimeLineID	chkpttli;
@@ -483,6 +483,41 @@ main(int argc, char **argv)
 	return 0;
 }
 
+file_action_t
+decide_wal_file_action(const char *fname)
+{
+    TimeLineID  file_tli;
+    XLogSegNo	file_segno;
+    XLogSegNo   last_common_segno;
+
+    /*
+	 * Find last common WAL segment number between source and target before
+	 * divergence given last common LSN (byte position).
+	 */
+    XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size);
+
+    /* Get current WAL segment number given current segment file name. */
+    XLogFromFileName(fname, &file_tli, &file_segno, ControlFile_target.xlog_seg_size);
+
+    /*
+     * Avoid unnecessarily copying WAL segment files created before last common
+     * segment to avoid performance penalty when many WAL segment files are
+     * retained on source and copied to target.
+     *
+     * These files are already common between new source (old target) and new
+     * target (old source). Only WAL segment files after the last common segment
+     * number on the new source need to be copied to the new target.
+     */
+    if (file_segno < last_common_segno)
+    {
+        pg_log_debug("WAL file entry \"%s\" not copied to target", fname);
+        return FILE_ACTION_NONE;
+    }
+
+    pg_log_debug("WAL file entry \"%s\" copied to target", fname);
+    return FILE_ACTION_COPY;
+}
+
 /*
  * Perform the rewind.
  *
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index d38635a73d..c7c72d4b38 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -14,6 +14,7 @@
 #include "access/timeline.h"
 #include "common/logging.h"
 #include "datapagemap.h"
+#include "filemap.h"
 #include "libpq-fe.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
@@ -50,6 +51,7 @@ extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
 
 /* in pg_rewind.c */
 extern void progress_report(bool finished);
+extern file_action_t decide_wal_file_action(const char *fname);
 
 /* in timeline.c */
 extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer,
diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl
index d636f35f5e..f9a079357e 100644
--- a/src/bin/pg_rewind/t/001_basic.pl
+++ b/src/bin/pg_rewind/t/001_basic.pl
@@ -4,7 +4,7 @@
 use strict;
 use warnings;
 use TestLib;
-use Test::More tests => 23;
+use Test::More tests => 29;
 
 use FindBin;
 use lib $FindBin::RealBin;
@@ -78,6 +78,31 @@ sub run_test
 		"insert into drop_tbl values ('in primary, after promotion')");
 	primary_psql("DROP TABLE drop_tbl");
 
+	# Record last modification time of WAL segment file common between both
+	# source and target, stored on current primary server. pg_rewind should
+	# refrain from overwriting this file with the source's newly forked copy.
+	$node_primary->psql(
+		'postgres',
+		"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000002');",
+		stdout => \my $common_wal_modified_at);
+
+	# Record last modification time of WAL segment file that is partially
+	# written to just before the new timeline, but still common between both
+	# source and target. pg_rewind should overwrite this file with the source's
+	# copy to be safe.
+	$node_primary->psql(
+		'postgres',
+		"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000003');",
+		stdout => \my $last_common_tli1_wal_modified_at);
+
+	# Record last modification time of first diverged WAL segment file on the
+	# old primary's timeline. pg_rewind should overwrite this file with the
+	# source's newly forked copy.
+	$node_primary->psql(
+		'postgres',
+		"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000020000000000000003');",
+		stdout => \my $last_common_tli2_wal_modified_at);
+
 	# Before running pg_rewind, do a couple of extra tests with several
 	# option combinations.  As the code paths taken by those tests
 	# do not change for the "local" and "remote" modes, just run them
@@ -172,6 +197,44 @@ in primary, before promotion
 ),
 		'drop');
 
+	if ($test_mode eq 'local' || $test_mode eq 'remote')
+	{
+		# Last modification time of WAL segment files common between both
+		# source and target is unchanged on the target. This indicates that
+		# pg_rewind skipped copying the source's files and overwriting the
+		# target's files.
+		$node_primary->psql(
+			'postgres',
+			"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000002');",
+			stdout => \my $common_wal_last_modified_at);
+
+		cmp_ok($common_wal_last_modified_at,
+			'==', $common_wal_modified_at,
+			'common WAL segment file on target, before divergence, not overwritten');
+
+		# Last modification time of WAL segment files just before and after
+		# the new timeline should now be further ahead. (Both of these WAL
+		# files are internally represented by segment 3.) This indicates that
+		# pg_rewind copied the source's files and overwrote the target's files.
+		$node_primary->psql(
+			'postgres',
+			"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000003');",
+			stdout => \my $last_common_tli1_wal_last_modified_at);
+
+		cmp_ok($last_common_tli1_wal_last_modified_at,
+			'>=', $last_common_tli1_wal_modified_at,
+			'last common WAL segment file on target, before divergence, overwritten');
+
+		$node_primary->psql(
+			'postgres',
+			"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000020000000000000003');",
+			stdout => \my $last_common_tli2_wal_last_modified_at);
+
+		cmp_ok($last_common_tli2_wal_last_modified_at,
+			'>=', $last_common_tli2_wal_modified_at,
+			'last common WAL segment file on target, before divergence, overwritten');
+	}
+
 	# Permissions on PGDATA should be default
   SKIP:
 	{
diff --git a/src/bin/pg_rewind/t/008_min_recovery_point.pl b/src/bin/pg_rewind/t/008_min_recovery_point.pl
index 9ebcbad0d2..1fa3d0907f 100644
--- a/src/bin/pg_rewind/t/008_min_recovery_point.pl
+++ b/src/bin/pg_rewind/t/008_min_recovery_point.pl
@@ -34,7 +34,7 @@ use strict;
 use warnings;
 use PostgresNode;
 use TestLib;
-use Test::More tests => 3;
+use Test::More tests => 7;
 
 use File::Copy;
 
@@ -138,6 +138,32 @@ $node_1->safe_psql('postgres',
 $node_2->poll_query_until('postgres',
 	q|SELECT COUNT(*) > 1 FROM public.bar|, 't');
 
+# Record last modification time of WAL segment file common between both
+# source and target, entirely before the segment of the divergence point.
+# pg_rewind should refrain from overwriting this file with the source's
+# copy.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000002');",
+	stdout => \my $common_wal_modified_at);
+
+# Record last modification time of WAL segment file that is partially
+# written to just before the new timeline, but still common between both
+# source and target. pg_rewind should overwrite this file with the source's
+# copy to be safe.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000003');",
+	stdout => \my $last_common_tli1_wal_modified_at);
+
+# Record last modification time of first diverged WAL segment file on the
+# old primary's timeline. pg_rewind should overwrite this file with the
+# source's newly forked copy.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000020000000000000003');",
+	stdout => \my $last_common_tli2_wal_modified_at);
+
 # At this point node_2 will shut down without a shutdown checkpoint,
 # but with WAL entries beyond the preceding shutdown checkpoint.
 $node_2->stop('fast');
@@ -175,3 +201,49 @@ and this too), 'table foo after rewind');
 
 $result = $node_2->safe_psql('postgres', 'SELECT * FROM public.bar');
 is($result, qq(in both), 'table bar after rewind');
+
+# Last modification time of WAL segment files common between both source and
+# target are unchanged on the target. This indicates that pg_rewind skipped
+# copying the source's files and overwriting the target's files.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000002');",
+	stdout => \my $common_wal_last_modified_at);
+
+cmp_ok($common_wal_last_modified_at,
+	'==', $common_wal_modified_at,
+	'common WAL segment file on target, before divergence, not overwritten');
+
+# Last modification time of WAL segment files just before and after the new
+# timelines should now be further ahead. (Both of these WAL files are
+# internally represented by segment 3.) This indicates that pg_rewind copied
+# the source's files and overwrote the target's files.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000010000000000000003');",
+	stdout => \my $last_common_tli1_wal_last_modified_at);
+
+cmp_ok($last_common_tli1_wal_last_modified_at,
+	'>=', $last_common_tli1_wal_modified_at,
+	'common WAL segment file on target, before divergence, not overwritten');
+
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000020000000000000003');",
+	stdout => \my $last_common_tli2_wal_last_modified_at);
+
+cmp_ok($last_common_tli2_wal_last_modified_at,
+	'>=', $last_common_tli2_wal_modified_at,
+	'last common WAL segment file on target, before divergence, not overwritten');
+
+# Last modification time of WAL segment files in source but not target should
+# now be further ahead on the target. This indicates that pg_rewind copied the
+# source's files and overwrote the target's files.
+$node_2->psql(
+	'postgres',
+	"SELECT extract(epoch from modification) FROM pg_stat_file('pg_wal/000000030000000000000003');",
+	stdout => \my $first_diverged_tli3_wal_last_modified_at);
+
+cmp_ok($first_diverged_tli3_wal_last_modified_at,
+	'>=', $last_common_tli2_wal_modified_at,
+	'last common WAL segment file on target, before divergence, not overwritten');
-- 
2.35.3