v1-0001-pg_rewind-ignore-shutdown-only-WAL-when-determining-.patch
application/octet-stream
Filename: v1-0001-pg_rewind-ignore-shutdown-only-WAL-when-determining-.patch
Type: application/octet-stream
Part: 1
From 9418668687a4ecf36531c2999efde29248488ae3 Mon Sep 17 00:00:00 2001
From: srinathv2 <srinath2133@gmail.com>
Date: Sat, 6 Sep 2025 21:15:57 +0530
Subject: [PATCH 1/1] pg_rewind: ignore shutdown-only WAL when determining
end-of-WAL
Previously, pg_rewind determined the end-of-WAL on the target by using
the last shutdown checkpoint (or minRecoveryPoint for a standby). This
caused false positives in scenarios where the old primary was shut down
after a failover: the only WAL record generated was a shutdown checkpoint,
while the new primary and old primary still contained identical data.
In such cases the target's end-of-WAL lay past the divergence point, so
the check
    if (target_wal_endrec > divergerec) rewind_needed = true;
set rewind_needed and pg_rewind performed a rewind even though no real
changes existed after the divergence point.
With this patch, pg_rewind now scans backward from the last checkpoint
to locate the most recent valid WAL record that is not a shutdown
checkpoint or XLOG switch. As a result, a rewind is only required when
the target contains actual changes past the divergence point, avoiding
unnecessary rewind operations in clean failover scenarios.
---
src/bin/pg_rewind/parsexlog.c | 36 ++++++++++++++++++++++++++++++-----
src/bin/pg_rewind/pg_rewind.c | 33 ++++++++++++++++++++++----------
src/bin/pg_rewind/pg_rewind.h | 2 +-
3 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 8f4b282c6b1..442515249f4 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -117,11 +117,15 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex,
}
/*
- * Reads one WAL record. Returns the end position of the record, without
- * doing anything with the record itself.
+ * Scan backward from 'ptr' via xl_prev to the most recent WAL record that
+ * is not a shutdown checkpoint or XLOG switch; such records could make
+ * pg_rewind think a rewind is required even when no real changes happened
+ * after failover.  Returns the end position of the last meaningful record.
+ * NOTE(review): the skip test compares only the info bits, not xl_rmid --
+ * confirm that records of other resource managers cannot match.
*/
XLogRecPtr
-readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
+findLastValidRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
const char *restoreCommand)
{
XLogRecord *record;
@@ -129,6 +133,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
char *errormsg;
XLogPageReadPrivate private;
XLogRecPtr endptr;
+ uint8 info;
private.tliIndex = tliIndex;
private.restoreCommand = restoreCommand;
@@ -138,16 +143,37 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
if (xlogreader == NULL)
pg_fatal("out of memory while allocating a WAL reading processor");
+ for (;;)
+ {
+ XLogBeginRead(xlogreader, ptr);
+ record = XLogReadRecord(xlogreader, &errormsg);
+ if (record == NULL)
+ {
+ if (errormsg)
+ pg_fatal("could not read WAL record at %X/%08X: %s",
+ LSN_FORMAT_ARGS(ptr), errormsg);
+ else
+ pg_fatal("could not read WAL record at %X/%08X",
+ LSN_FORMAT_ARGS(ptr));
+ }
+ ptr = record->xl_prev;
+ info = record->xl_info & ~XLR_INFO_MASK;
+ if((info != XLOG_CHECKPOINT_SHUTDOWN) && (info != XLOG_SWITCH))
+ {
+ break;
+ }
+ }
+ ptr = xlogreader->EndRecPtr;
XLogBeginRead(xlogreader, ptr);
record = XLogReadRecord(xlogreader, &errormsg);
if (record == NULL)
{
if (errormsg)
pg_fatal("could not read WAL record at %X/%08X: %s",
- LSN_FORMAT_ARGS(ptr), errormsg);
+ LSN_FORMAT_ARGS(ptr), errormsg);
else
pg_fatal("could not read WAL record at %X/%08X",
- LSN_FORMAT_ARGS(ptr));
+ LSN_FORMAT_ARGS(ptr));
}
endptr = xlogreader->EndRecPtr;
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e..c97789584d7 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -405,16 +405,29 @@ main(int argc, char **argv)
/*
- * Determine the end-of-WAL on the target.
- *
- * The WAL ends at the last shutdown checkpoint, or at
- * minRecoveryPoint if it was a standby. (If we supported rewinding a
- * server that was not shut down cleanly, we would need to replay
- * until we reach the first invalid record, like crash recovery does.)
- */
-
- /* read the checkpoint record on the target to see where it ends. */
- chkptendrec = readOneRecord(datadir_target,
+ * Determine the effective end-of-WAL on the target.
+ *
+ * Previously, this was taken directly from the last shutdown checkpoint,
+ * or from minRecoveryPoint if the server was a standby. However, this
+ * approach can falsely indicate divergence: when the old primary is shut
+ * down after promoting a standby, the only WAL record generated on the
+ * old primary is a shutdown checkpoint. In such cases, both clusters have
+ * identical data, yet the presence of that extra checkpoint record makes
+ * pg_rewind believe the target WAL extends past the divergence point:
+ *
+ * if (target_wal_endrec > divergerec)
+ * rewind_needed = true;
+ *
+ * That sets rewind_needed = true even though no user data changes exist.
+ *
+ * To avoid this, we no longer treat a plain shutdown checkpoint
+ * as a meaningful record when determining end-of-WAL.  Instead, we
+ * scan backward from the last checkpoint, skipping shutdown checkpoints
+ * and XLOG switch records, to the most recent other record.  A rewind is
+ * then triggered only if the target has actual changes after divergence.
+ */
+
+ chkptendrec = findLastValidRecord(datadir_target,
ControlFile_target.checkPoint,
targetNentries - 1,
restore_command);
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 9cea144d2b2..304c9cd5ca9 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -40,7 +40,7 @@ extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr,
XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
XLogRecPtr *lastchkptredo,
const char *restoreCommand);
-extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
+extern XLogRecPtr findLastValidRecord(const char *datadir, XLogRecPtr ptr,
int tliIndex, const char *restoreCommand);
/* in pg_rewind.c */
--
2.43.0