v20250312-0005-data_checksum_version-reworks.patch

text/x-patch

Filename: v20250312-0005-data_checksum_version-reworks.patch
Type: text/x-patch
Part: 4
Message: Re: Changing the state of data checksums in a running cluster

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v20250312-0005
Subject: data_checksum_version reworks
File+
src/backend/access/transam/xlog.c 98 35
src/backend/postmaster/launch_backend.c 10 0
src/include/catalog/pg_control.h 4 1
src/include/miscadmin.h 4 0
From b78ba0ddda20dd2331358c157589ac80c807705a Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 11 Mar 2025 19:16:23 +0100
Subject: [PATCH v20250312 5/6] data_checksum_version reworks

---
 src/backend/access/transam/xlog.c       | 133 +++++++++++++++++-------
 src/backend/postmaster/launch_backend.c |  10 ++
 src/include/catalog/pg_control.h        |   5 +-
 src/include/miscadmin.h                 |   4 +
 4 files changed, 116 insertions(+), 36 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f137cdc6d42..61da6d583cd 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -550,6 +550,9 @@ typedef struct XLogCtlData
 	 */
 	XLogRecPtr	lastFpwDisableRecPtr;
 
+	/* last data_checksum_version we've seen */
+	uint32		data_checksum_version;
+
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
@@ -4262,6 +4265,12 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
 	ControlFile->wal_log_hints = wal_log_hints;
 	ControlFile->track_commit_timestamp = track_commit_timestamp;
 	ControlFile->data_checksum_version = data_checksum_version;
+
+	/*
+	 * Set the data_checksum_version value into XLogCtl, which is where all
+	 * processes get the current value from. (Maybe it should go just there?)
+	 */
+	XLogCtl->data_checksum_version = data_checksum_version;
 }
 
 static void
@@ -4601,8 +4610,6 @@ ReadControlFile(void)
 		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
 
 	CalculateCheckpointSegments();
-
-	SetLocalDataChecksumVersion(ControlFile->data_checksum_version);
 }
 
 /*
@@ -4734,9 +4741,9 @@ SetDataChecksumsOnInProgress(void)
 
 	XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
 
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION;
-	LWLockRelease(ControlFileLock);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
 
@@ -4780,28 +4787,28 @@ SetDataChecksumsOn(void)
 
 	Assert(ControlFile != NULL);
 
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&XLogCtl->info_lck);
 
 	/*
 	 * The only allowed state transition to "on" is from "inprogress-on" since
 	 * that state ensures that all pages will have data checksums written.
 	 */
-	if (ControlFile->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
+	if (XLogCtl->data_checksum_version != PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
 	{
-		LWLockRelease(ControlFileLock);
+		SpinLockRelease(&XLogCtl->info_lck);
 		elog(ERROR, "checksums not in \"inprogress-on\" mode");
 	}
 
-	LWLockRelease(ControlFileLock);
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 	START_CRIT_SECTION();
 
 	XLogChecksums(PG_DATA_CHECKSUM_VERSION);
 
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
-	LWLockRelease(ControlFileLock);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
 
@@ -4836,12 +4843,12 @@ SetDataChecksumsOff(void)
 
 	Assert(ControlFile);
 
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	SpinLockAcquire(&XLogCtl->info_lck);
 
 	/* If data checksums are already disabled there is nothing to do */
-	if (ControlFile->data_checksum_version == 0)
+	if (XLogCtl->data_checksum_version == 0)
 	{
-		LWLockRelease(ControlFileLock);
+		SpinLockRelease(&XLogCtl->info_lck);
 		return;
 	}
 
@@ -4852,18 +4859,18 @@ SetDataChecksumsOff(void)
 	 * "inprogress-off" the next transition to "off" can be performed, after
 	 * which all data checksum processing is disabled.
 	 */
-	if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
+	if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
 	{
-		LWLockRelease(ControlFileLock);
+		SpinLockRelease(&XLogCtl->info_lck);
 
 		MyProc->delayChkptFlags |= DELAY_CHKPT_START;
 		START_CRIT_SECTION();
 
 		XLogChecksums(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
 
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION;
-		LWLockRelease(ControlFileLock);
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION;
+		SpinLockRelease(&XLogCtl->info_lck);
 
 		barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
 
@@ -4890,7 +4897,7 @@ SetDataChecksumsOff(void)
 		 * or "inprogress-off" and we can transition directly to "off" from
 		 * there.
 		 */
-		LWLockRelease(ControlFileLock);
+		SpinLockRelease(&XLogCtl->info_lck);
 	}
 
 	/*
@@ -4901,9 +4908,9 @@ SetDataChecksumsOff(void)
 
 	XLogChecksums(0);
 
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->data_checksum_version = 0;
-	LWLockRelease(ControlFileLock);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->data_checksum_version = 0;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
 
@@ -4958,9 +4965,9 @@ AbsorbChecksumsOffBarrier(void)
 void
 InitLocalControldata(void)
 {
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	SetLocalDataChecksumVersion(ControlFile->data_checksum_version);
-	LWLockRelease(ControlFileLock);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+	SpinLockRelease(&XLogCtl->info_lck);
 }
 
 /*
@@ -4994,6 +5001,40 @@ SetLocalDataChecksumVersion(uint32 data_checksum_version)
 	}
 }
 
+/*
+ * Initialize the various data checksum values - GUC, local, ....
+ */
+void
+InitLocalDataChecksumVersion(void)
+{
+	uint32	data_checksum_version;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	data_checksum_version = XLogCtl->data_checksum_version;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	SetLocalDataChecksumVersion(data_checksum_version);
+}
+
+/*
+ * Get the local data_checksum_version (cached XLogCtl value).
+ */
+uint32
+GetLocalDataChecksumVersion(void)
+{
+	return LocalDataChecksumVersion;
+}
+
+/*
+ * Get the *current* data_checksum_version (might not be written to control
+ * file yet).
+ */
+uint32
+GetCurrentDataChecksumVersion(void)
+{
+	return XLogCtl->data_checksum_version;
+}
+
 /* guc hook */
 const char *
 show_data_checksums(void)
@@ -5447,6 +5488,11 @@ XLOGShmemInit(void)
 	XLogCtl->InstallXLogFileSegmentActive = false;
 	XLogCtl->WalWriterSleeping = false;
 
+	/* use the checksum info from control file */
+	XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
+
+	SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
+
 	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
 	SpinLockInit(&XLogCtl->info_lck);
 	pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
@@ -6585,7 +6631,7 @@ StartupXLOG(void)
 	 * background worker directly from here, it has to be launched from a
 	 * regular backend.
 	 */
-	if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
+	if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION)
 		ereport(WARNING,
 				(errmsg("data checksums are being enabled, but no worker is running"),
 				 errhint("If checksums were being enabled during shutdown then processing must be manually restarted.")));
@@ -6596,13 +6642,13 @@ StartupXLOG(void)
 	 * checksums and we can move to off instead of prompting the user to
 	 * perform any action.
 	 */
-	if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)
+	if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)
 	{
 		XLogChecksums(0);
 
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->data_checksum_version = 0;
-		LWLockRelease(ControlFileLock);
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->data_checksum_version = 0;
+		SpinLockRelease(&XLogCtl->info_lck);
 	}
 
 	/*
@@ -7460,6 +7506,12 @@ CreateCheckPoint(int flags)
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 	checkPoint.wal_level = wal_level;
 
+	/*
+	 * Get the current data_checksum_version value from xlogctl, valid at
+	 * the time of the checkpoint.
+	 */
+	checkPoint.data_checksum_version = XLogCtl->data_checksum_version;
+
 	if (shutdown)
 	{
 		XLogRecPtr	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
@@ -7715,6 +7767,9 @@ CreateCheckPoint(int flags)
 	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = 0;
 
+	/* make sure we start with the checksum version as of the checkpoint */
+	ControlFile->data_checksum_version = checkPoint.data_checksum_version;
+
 	/*
 	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
 	 * unused on non-shutdown checkpoints, but seems useful to store it always
@@ -7861,6 +7916,10 @@ CreateEndOfRecoveryRecord(void)
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	ControlFile->minRecoveryPoint = recptr;
 	ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
+
+	/* start with the latest checksum version (as of the end of recovery) */
+	ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
+
 	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
 
@@ -8203,6 +8262,10 @@ CreateRestartPoint(int flags)
 			if (flags & CHECKPOINT_IS_SHUTDOWN)
 				ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
 		}
+
+		/* we shall start with the latest checksum version */
+		ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version;
+
 		UpdateControlFile();
 	}
 	LWLockRelease(ControlFileLock);
@@ -9065,9 +9128,9 @@ xlog_redo(XLogReaderState *record)
 
 		memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
 
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->data_checksum_version = state.new_checksumtype;
-		LWLockRelease(ControlFileLock);
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->data_checksum_version = state.new_checksumtype;
+		SpinLockRelease(&XLogCtl->info_lck);
 
 		/*
 		 * Block on a procsignalbarrier to await all processes having seen the
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index 92d8017fd56..fc6e2c6ed41 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -271,6 +271,16 @@ postmaster_child_launch(BackendType child_type, int child_slot,
 			memcpy(MyClientSocket, client_sock, sizeof(ClientSocket));
 		}
 
+		/*
+		 * update the LocalProcessControlFile to match XLogCtl->data_checksum_version
+		 *
+		 * XXX It seems the postmaster (which is what gets forked into the new
+		 * child process) does not absorb the checksum barriers, therefore it
+		 * does not update the value (except after a restart). Not sure if there
+		 * is some sort of race condition.
+		 */
+		InitLocalDataChecksumVersion();
+
 		/*
 		 * Run the appropriate Main function
 		 */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 9b5a50adf54..727042b73be 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -62,6 +62,9 @@ typedef struct CheckPoint
 	 * set to InvalidTransactionId.
 	 */
 	TransactionId oldestActiveXid;
+
+	/* data checksums at the time of the checkpoint  */
+	uint32		data_checksum_version;
 } CheckPoint;
 
 /* XLOG info values for XLOG rmgr */
@@ -220,7 +223,7 @@ typedef struct ControlFileData
 	bool		float8ByVal;	/* float8, int8, etc pass-by-value? */
 
 	/* Are data pages protected by checksums? Zero if no checksum version */
-	uint32		data_checksum_version;
+	uint32		data_checksum_version;		/* persistent */
 
 	/*
 	 * True if the default signedness of char is "signed" on a platform where
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 69b1dc720f6..befab157ecf 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -532,6 +532,10 @@ extern Size EstimateClientConnectionInfoSpace(void);
 extern void SerializeClientConnectionInfo(Size maxsize, char *start_address);
 extern void RestoreClientConnectionInfo(char *conninfo);
 
+extern uint32 GetLocalDataChecksumVersion(void);
+extern uint32 GetCurrentDataChecksumVersion(void);
+extern void InitLocalDataChecksumVersion(void);
+
 /* in executor/nodeHash.c */
 extern size_t get_hash_memory_limit(void);
 
-- 
2.48.1