checksums-review.txt

text/plain

Filename: checksums-review.txt
Type: text/plain
Part: 0
Message: Re: Changing the state of data checksums in a running cluster
From 40d26a0406a213207101a837999e8b57df25fc1d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Wed, 5 Nov 2025 13:26:41 +0100
Subject: [PATCH v20251105 2/3] review

---
 doc/src/sgml/func/func-admin.sgml            | 20 ++++++++++++-----
 doc/src/sgml/glossary.sgml                   |  2 +-
 doc/src/sgml/monitoring.sgml                 | 23 ++++++++++++++++++--
 doc/src/sgml/wal.sgml                        | 18 +++++++++++++++
 src/backend/access/transam/xlog.c            | 17 +++++++++------
 src/backend/postmaster/datachecksumsworker.c | 14 ++++++++++++
 src/backend/storage/ipc/ipci.c               |  1 -
 src/include/postmaster/datachecksumsworker.h |  2 +-
 src/tools/pgindent/typedefs.list             |  2 ++
 9 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml
index f3a8782ede0..4d6f6a7e486 100644
--- a/doc/src/sgml/func/func-admin.sgml
+++ b/doc/src/sgml/func/func-admin.sgml
@@ -3008,11 +3008,11 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
         <indexterm>
          <primary>pg_enable_data_checksums</primary>
         </indexterm>
-        <function>pg_enable_data_checksums</function> ( <optional><parameter>cost_delay</parameter> <type>int</type>, <parameter>cost_limit</parameter> <type>int</type></optional> )
+        <function>pg_enable_data_checksums</function> ( <optional><parameter>cost_delay</parameter> <type>int</type>, <parameter>cost_limit</parameter> <type>int</type></optional>, <parameter>fast</parameter> <type>bool</type></optional> )
         <returnvalue>void</returnvalue>
        </para>
        <para>
-        Initiates data checksums for the cluster. This will switch the data
+        Initiates the process of enabling data checksums for the cluster. This will switch the data
         checksums mode to <literal>inprogress-on</literal> as well as start a
         background worker that will process all pages in the database and
         enable checksums on them. When all data pages have had checksums
@@ -3023,7 +3023,9 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
         If <parameter>cost_delay</parameter> and <parameter>cost_limit</parameter> are
         specified, the speed of the process is throttled using the same principles as
         <link linkend="runtime-config-resource-vacuum-cost">Cost-based Vacuum Delay</link>.
-       </para></entry>
+       </para>
+       <para>FIXME missing documentation of the "fast" parameter</para>
+       </entry>
       </row>
 
       <row>
@@ -3031,7 +3033,7 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
         <indexterm>
          <primary>pg_disable_data_checksums</primary>
         </indexterm>
-        <function>pg_disable_data_checksums</function> ()
+        <function>pg_disable_data_checksums</function> ( <parameter>fast</parameter> <type>bool</type></optional> )
         <returnvalue>void</returnvalue>
        </para>
        <para>
@@ -3042,12 +3044,20 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
         changed to <literal>off</literal>.  At this point the data pages will
         still have checksums recorded but they are not updated when pages are
         modified.
-       </para></entry>
+       </para>
+       <para>FIXME missing documentation of the "fast" parameter</para>
+       </entry>
       </row>
      </tbody>
     </tgroup>
    </table>
 
+   <para>
+    FIXME I think this should briefly explain how this interacts with checkpoints (through
+    the fast parameters). It should probably also discuss how this affects streaming standby
+    due to forcing a restart point, etc. And maybe comment on possible mitigations?
+   </para>
+
   </sect2>
 
   </sect1>
diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml
index 9bac0c96348..3ba0e8c6c5c 100644
--- a/doc/src/sgml/glossary.sgml
+++ b/doc/src/sgml/glossary.sgml
@@ -580,7 +580,7 @@
    <glossdef>
     <para>
      An <glossterm linkend="glossary-auxiliary-proc">auxiliary process</glossterm>
-     which enables or disables data checksums in a specific database.
+     which enables data checksums in a specific database.
     </para>
    </glossdef>
   </glossentry>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index b56e220f3d8..7efa1af746a 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3554,7 +3554,10 @@ description | Waiting for a newly initialized WAL file to reach durable storage
        database (or on a shared object).
        Detected failures are reported regardless of the
        <xref linkend="guc-data-checksums"/> setting.
-      </para></entry>
+      </para>
+      <para>XXX I'm not sure what "regardless" means in this context. I guess
+      it means we don't reset the counters when disabling checksums?</para>
+     </entry>
      </row>
 
      <row>
@@ -6959,6 +6962,9 @@ FROM pg_stat_get_backend_idset() AS backendid;
    <structname>pg_stat_progress_data_checksums</structname> view will contain
    a row for the launcher process, and one row for each worker process which
    is currently calculating checksums for the data pages in one database.
+   The launcher provides overview of the overall progress (how many database
+   have been processed, how many remain), while the workers track progress for
+   currently processed databases.
   </para>
 
   <table id="pg-stat-progress-data-checksums-view" xreflabel="pg_stat_progress_data_checksums">
@@ -6984,7 +6990,8 @@ FROM pg_stat_get_backend_idset() AS backendid;
         <structfield>pid</structfield> <type>integer</type>
        </para>
        <para>
-        Process ID of a datachecksumworker process.
+        Process ID of a datachecksumsworker process.
+        FIXME Is datachecksumsworker defined somewhere? Does it refer to the launcher too?
        </para>
       </entry>
      </row>
@@ -7127,6 +7134,12 @@ FROM pg_stat_get_backend_idset() AS backendid;
        The command is currently disabling data checksums on the cluster.
       </entry>
      </row>
+     <row>
+      <entry><literal>waiting</literal></entry>
+      <entry>
+       FIXME
+      </entry>
+     </row>
      <row>
       <entry><literal>waiting on temporary tables</literal></entry>
       <entry>
@@ -7141,6 +7154,12 @@ FROM pg_stat_get_backend_idset() AS backendid;
        state before finishing.
       </entry>
      </row>
+     <row>
+      <entry><literal>done</literal></entry>
+      <entry>
+       FIXME
+      </entry>
+     </row>
     </tbody>
    </tgroup>
   </table>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 0ada90ca0b1..8ef16d7769f 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -284,6 +284,11 @@
     <link linkend="functions-admin-checksum">functions</link>.
    </para>
 
+   <para>
+    Both enabling and disabling checksums happens in two phases, separated by
+    a checkpoint to ensure durability.
+   </para>
+
    <para>
     Enabling checksums will put the cluster checksum mode in
     <literal>inprogress-on</literal> mode.  During this time, checksums will be
@@ -314,6 +319,14 @@
     no support for resuming work from where it was interrupted.
    </para>
 
+   <para>
+    Disabling checksums will put the cluster checksum mode in
+    <literal>inprogress-off</literal> mode.  During this time, checksums will be
+    written but not verified. After all processes acknowledge the change,
+    the mode will automatically switch to <literal>off</literal>. In this case
+    rewriting all blocks is not needed, but checkpoints are still required.
+   </para>
+
    <note>
     <para>
      Enabling checksums can cause significant I/O to the system, as most of the
@@ -324,6 +337,11 @@
     </para>
    </note>
 
+   <para>
+    XXX Maybe this is the place that should mention checkpoints/restartpoints,
+    how it impacts systems/replication and how to mitigate that?
+   </para>
+
   </sect2>
  </sect1>
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f7633f47551..807b82eed4f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -607,7 +607,7 @@ typedef struct ChecksumBarrierCondition
 	int			barrier_ne[MAX_BARRIER_CONDITIONS];
 	/* The number of elements in the barrier_ne set */
 	int			barrier_ne_sz;
-}			ChecksumBarrierCondition;
+} ChecksumBarrierCondition;
 
 static const ChecksumBarrierCondition checksum_barriers[] =
 {
@@ -618,7 +618,6 @@ static const ChecksumBarrierCondition checksum_barriers[] =
 	{-1}
 };
 
-
 /*
  * Calculate the amount of space left on the page after 'endptr'. Beware
  * multiple evaluation!
@@ -694,10 +693,10 @@ static TimeLineID LocalMinRecoveryPointTLI;
 static bool updateMinRecoveryPoint = true;
 
 /*
- * Local state fror Controlfile data_checksum_version.  After initialization
+ * Local state for Controlfile data_checksum_version.  After initialization
  * this is only updated when absorbing a procsignal barrier during interrupt
  * processing.  The reason for keeping a copy in backend-private memory is to
- * avoid locking for interrogating checksum state.  Possible values are the
+ * avoid locking for interrogating the checksum state.  Possible values are the
  * checksum versions defined in storage/bufpage.h as well as zero when data
  * checksums are disabled.
  */
@@ -712,6 +711,10 @@ static uint32 LocalDataChecksumVersion = 0;
  * processing the barrier.  This may happen if the process is spawned between
  * the update of XLogCtl->data_checksum_version and the barrier being emitted.
  * This can only happen on the very first barrier so mark that with this flag.
+ *
+ * XXX Is PG_DATA_CHECKSUM_ON_VERSION still the only transition with an assert?
+ * I think it was replaced by checking checksum_barriers for every transition,
+ * with elog(ERROR), no?
  */
 static bool InitialDataChecksumTransition = true;
 
@@ -4335,7 +4338,7 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
 
 	/*
 	 * Set the data_checksum_version value into XLogCtl, which is where all
-	 * processes get the current value from. (Maybe it should go just there?)
+	 * processes get the current value from.
 	 */
 	XLogCtl->data_checksum_version = data_checksum_version;
 }
@@ -4921,7 +4924,7 @@ SetDataChecksumsOff(bool immediate_checkpoint)
 	uint64		barrier;
 	int			flags;
 
-	Assert(ControlFile);
+	Assert(ControlFile != NULL);
 
 	SpinLockAcquire(&XLogCtl->info_lck);
 
@@ -5030,7 +5033,7 @@ SetDataChecksumsOff(bool immediate_checkpoint)
 bool
 AbsorbDataChecksumsBarrier(int target_state)
 {
-	const		ChecksumBarrierCondition *condition = checksum_barriers;
+	const ChecksumBarrierCondition *condition = checksum_barriers;
 	int			current = LocalDataChecksumVersion;
 	bool		found = false;
 
diff --git a/src/backend/postmaster/datachecksumsworker.c b/src/backend/postmaster/datachecksumsworker.c
index 3deb57a96de..67045b9014d 100644
--- a/src/backend/postmaster/datachecksumsworker.c
+++ b/src/backend/postmaster/datachecksumsworker.c
@@ -52,6 +52,9 @@
  *    - Data checksums SHALL NOT be considered enabled cluster-wide until all
  *      currently connected backends have the local state "enabled"
  *
+ * FIXME I'm not 100% sure I understand what the two above points say. What does
+ * "violate local data_checksums state" means"?
+ *
  * There are two levels of synchronization required for enabling data checksums
  * in an online cluster: (i) changing state in the active backends ("on",
  * "off", "inprogress-on" and "inprogress-off"), and (ii) ensuring no
@@ -60,6 +63,10 @@
  * latter with ensuring that any concurrent activity cannot break the data
  * checksum contract during processing.
  *
+ * FIXME This para talks about "enabling" but then mentions all four states,
+ * including "inprogress-off" and "off". Maybe it should talk about "changing
+ * data_checksums" instead?
+ *
  * Synchronizing the state change is done with procsignal barriers, where the
  * WAL logging backend updating the global state in the controlfile will wait
  * for all other backends to absorb the barrier. Barrier absorption will happen
@@ -88,6 +95,10 @@
  *   enables data checksums cluster-wide, there are four sets of backends where
  *   Bd shall be an empty set:
  *
+ * FIXME Maybe mention which process initiates the procsignalbarrier?
+ * FIXME Don't we actually wait or the barrier before we start rewriting data?
+ * I think Bd has to be empty prior to that, otherwise it might break checksums.
+ *
  *   Bg: Backend updating the global state and emitting the procsignalbarrier
  *   Bd: Backends in "off" state
  *   Be: Backends in "on" state
@@ -124,6 +135,8 @@
  *   stop writing data checksums as no backend is enforcing data checksum
  *   validation any longer.
  *
+ * XXX Maybe it'd make sense to "visualize" the progress between the states
+ * and barriers in some way. Say, by doing
  *
  * Potential optimizations
  * -----------------------
@@ -148,6 +161,7 @@
  *     to enable checksums on a cluster which is in inprogress-on state and
  *     may have checksummed pages (make pg_checksums be able to resume an
  *     online operation).
+ *   * Restartability (not necessarily with page granularity).
  *
  *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 44213d140ae..9014e90f1c7 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -31,7 +31,6 @@
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/datachecksumsworker.h"
-#include "postmaster/postmaster.h"
 #include "postmaster/walsummarizer.h"
 #include "replication/logicallauncher.h"
 #include "replication/origin.h"
diff --git a/src/include/postmaster/datachecksumsworker.h b/src/include/postmaster/datachecksumsworker.h
index 2cd066fd0fe..0daef709ec8 100644
--- a/src/include/postmaster/datachecksumsworker.h
+++ b/src/include/postmaster/datachecksumsworker.h
@@ -24,7 +24,7 @@ typedef enum DataChecksumsWorkerOperation
 	ENABLE_DATACHECKSUMS,
 	DISABLE_DATACHECKSUMS,
 	/* TODO: VERIFY_DATACHECKSUMS, */
-}			DataChecksumsWorkerOperation;
+} DataChecksumsWorkerOperation;
 
 /*
  * Possible states for a database entry which has been processed. Exported
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9d6b4e57cf3..3049e6018b3 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -417,6 +417,7 @@ CheckPointStmt
 CheckpointStatsData
 CheckpointerRequest
 CheckpointerShmemStruct
+ChecksumBarrierCondition
 ChecksumType
 Chromosome
 CkptSortItem
@@ -587,6 +588,7 @@ CustomScan
 CustomScanMethods
 CustomScanState
 CycleCtr
+DataChecksumsWorkerOperation
 DBState
 DCHCacheEntry
 DEADLOCK_INFO
-- 
2.51.0