v1-0001-Skip-pg_control-re-read-in-EXEC_BACKEND-sub-proce.patch
application/octet-stream
Filename: v1-0001-Skip-pg_control-re-read-in-EXEC_BACKEND-sub-proce.patch
Type: application/octet-stream
Part: 0
From b961438264bc6d279655b8852110c4751060964a Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Wed, 6 May 2026 13:10:07 +0300
Subject: [PATCH v1] Skip pg_control re-read in EXEC_BACKEND sub-processes
In EXEC_BACKEND mode every newly exec'd sub-process re-reads pg_control
from disk in SubPostmasterMain() via LocalProcessControlFile(). That
read is not interlocked against update_controlfile() in the postmaster's
checkpointer, which writes the file non-atomically. On filesystems that
do not enforce POSIX read/write atomicity for concurrent access to the
same region (notably NTFS and ext4) the sub-process can observe a torn
image and abort with "incorrect checksum in control file".
The sub-process did not actually need to re-read the file: pg_control
contents that matter before shared memory is attached are either
initdb-time invariants (already validated by the postmaster) or values
that the postmaster has long since installed in shared memory. The only
piece of state pulled out of LocalControlFile in the child that is not
already propagated through the regular non-default-GUC mechanism is
data_checksum_version, because the data_checksums GUC is updated by
direct variable assignment (see SetLocalDataChecksumState) rather than
through SetConfigOption, so its source stays at PGC_S_DEFAULT and it
never lands on guc_nondef_list.
Fix by removing the LocalProcessControlFile(false) call from
SubPostmasterMain() and instead:
- Pass data_checksum_version through BackendParameters; the child
applies it via SetLocalDataChecksumState() in
restore_backend_variables(), as part of its normal forkable-state
restore.
- wal_segment_size already propagates correctly: the postmaster sets
it via SetConfigOption(..., PGC_S_DYNAMIC_DEFAULT) in
ReadControlFile(), which puts the GUC on guc_nondef_list, so
write_nondefault_variables/read_nondefault_variables carries it to
the child. Settings derived from wal_segment_size that are not
GUCs (UsableBytesInSegment, CheckPointSegments) are recomputed by a
new helper, ApplyWalSegSizeSettings(), that is also factored out of
ReadControlFile() in this commit.
After the change the EXEC_BACKEND child never opens pg_control, so the
torn-read window is closed. The global ControlFile pointer remains NULL
in the child until the standard shmem attach hooks rebind it to the
shared-memory copy. This brings EXEC_BACKEND startup behaviorally in
line with fork(): both observe shared-memory pg_control as soon as
they're permitted to, and neither carries a private snapshot.
Discussion: https://postgr.es/m/f59335a4-83ff-438a-a30e-7cf2200276b6%40postgrespro.ru
Discussion: https://postgr.es/m/70BF24D6-DC51-443F-B55A-95735803842A%40amazon.com
Author: Alexander Korotkov <aekorotkov@gmail.com>
---
src/backend/access/transam/xlog.c | 22 +++++++++--
src/backend/postmaster/launch_backend.c | 50 +++++++++++++++++++++++--
src/include/access/xlog.h | 1 +
3 files changed, 66 insertions(+), 7 deletions(-)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f0434da40c9..9f26b72e0d9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4594,6 +4594,24 @@ ReadControlFile(void)
wal_segment_size = ControlFile->xlog_seg_size;
+ snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
+ SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
+ PGC_S_DYNAMIC_DEFAULT);
+
+ ApplyWalSegSizeSettings();
+}
+
+/*
+ * Validate wal_segment_size and (re)compute derived settings that aren't
+ * propagated through GUC variables.
+ *
+ * This is called by ReadControlFile() in the postmaster after reading
+ * pg_control, and by EXEC_BACKEND child startup after wal_segment_size
+ * has been restored via the GUC propagation mechanism.
+ */
+void
+ApplyWalSegSizeSettings(void)
+{
if (!IsValidWalSegSize(wal_segment_size))
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg_plural("invalid WAL segment size in control file (%d byte)",
@@ -4602,10 +4620,6 @@ ReadControlFile(void)
wal_segment_size),
errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
- snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
- SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
- PGC_S_DYNAMIC_DEFAULT);
-
/* check and update variables dependent on wal_segment_size */
if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index 8f3cfea880c..5a04160c214 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -33,6 +33,7 @@
#include <unistd.h>
+#include "access/xlog.h"
#include "libpq/libpq-be.h"
#include "miscadmin.h"
#include "postmaster/autovacuum.h"
@@ -132,6 +133,14 @@ typedef struct
int32 timing_tsc_frequency_khz;
+ /*
+ * data_checksum_version from pg_control. Propagated explicitly because
+ * the data_checksums GUC is bound directly to its variable (no
+ * SetConfigOption call), so it doesn't ride along the regular
+ * non-default-GUCs propagation path used for, e.g., wal_segment_size.
+ */
+ uint32 data_checksum_version;
+
/*
* These are only used by backend processes, but are here because passing
* a socket needs some special handling on Windows. 'client_sock' is an
@@ -662,10 +671,28 @@ SubPostmasterMain(int argc, char *argv[])
checkDataDir();
/*
- * (re-)read control file, as it contains config. The postmaster will
- * already have read this, but this process doesn't know about that.
+ * The postmaster already validated pg_control at startup. We don't
+ * re-read it here: doing so would race against concurrent writes by
+ * the checkpointer and could see a torn image (NTFS and ext4 can
+ * return mashed-together data for a concurrent read/write of the same
+ * region). Instead, we rely on inherited state:
+ *
+ * - wal_segment_size and other initdb-time GUCs are restored above by
+ * read_nondefault_variables() since SetConfigOption() in the
+ * postmaster placed them on guc_nondef_list.
+ *
+ * - data_checksum_version was applied in restore_backend_variables()
+ * from the BackendParameters snapshot.
+ *
+ * - The shared-memory ControlFile is re-attached below by
+ * InitShmemAllocator() / ShmemCallRequestCallbacks(), at which
+ * point the global ControlFile pointer is again valid for any
+ * code that needs the live (volatile) fields.
+ *
+ * Recompute settings derived from wal_segment_size that aren't
+ * propagated through GUCs (UsableBytesInSegment, CheckPointSegments).
*/
- LocalProcessControlFile(false);
+ ApplyWalSegSizeSettings();
RegisterBuiltinShmemCallbacks();
@@ -755,6 +782,15 @@ save_backend_variables(BackendParameters *param,
param->timing_tsc_frequency_khz = timing_tsc_frequency_khz;
+ /*
+ * Propagate the data checksum version. In the postmaster the GUC-bound
+ * 'data_checksums' variable was set from pg_control by
+ * SetLocalDataChecksumState(); 'data_checksums' is not propagated via
+ * the regular non-default-GUC mechanism (its value is assigned directly,
+ * so its source stays at PGC_S_DEFAULT), so we ship it explicitly here.
+ */
+ param->data_checksum_version = data_checksums;
+
#ifdef WIN32
param->PostmasterHandle = PostmasterHandle;
if (!write_duplicated_handle(¶m->initial_signal_pipe,
@@ -1011,6 +1047,14 @@ restore_backend_variables(BackendParameters *param)
timing_tsc_frequency_khz = param->timing_tsc_frequency_khz;
+ /*
+ * Restore data checksum state from the inherited snapshot of
+ * pg_control. Doing this here, before any code can need it, avoids
+ * having to re-read pg_control from disk in this child (which used to
+ * race against concurrent writes by the postmaster's checkpointer).
+ */
+ SetLocalDataChecksumState(param->data_checksum_version);
+
/* Re-run logic usually done by assign_timing_clock_source */
pg_initialize_timing();
pg_set_timing_clock_source(timing_clock_source);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 4dd98624204..b9867702bf9 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -264,6 +264,7 @@ extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
extern void BootStrapXLOG(uint32 data_checksum_version);
extern void InitializeWalConsistencyChecking(void);
extern void LocalProcessControlFile(bool reset);
+extern void ApplyWalSegSizeSettings(void);
extern WalLevel GetActiveWalLevelOnStandby(void);
extern void StartupXLOG(void);
extern void ShutdownXLOG(int code, Datum arg);
--
2.39.5 (Apple Git-154)