From 9418668687a4ecf36531c2999efde29248488ae3 Mon Sep 17 00:00:00 2001 From: srinathv2 Date: Sat, 6 Sep 2025 21:15:57 +0530 Subject: [PATCH 1/1] pg_rewind: ignore shutdown-only WAL when determining end-of-WAL Previously, pg_rewind determined the end-of-WAL on the target by using the last shutdown checkpoint (or minRecoveryPoint for a standby). This caused false positives in scenarios where the old primary was shut down after a failover: the only WAL record generated was a shutdown checkpoint, while the new primary and old primary still contained identical data. In such cases, pg_rewind incorrectly concluded that if (target_wal_endrec > divergerec) rewind_needed = true; and performed a rewind even though no real changes existed after the divergence point. With this patch, pg_rewind now scans backward from the last checkpoint to locate the most recent valid WAL record that is not a shutdown checkpoint or XLOG switch. As a result, a rewind is only required when the target contains actual changes past the divergence point, avoiding unnecessary rewind operations in clean failover scenarios. --- src/bin/pg_rewind/parsexlog.c | 36 ++++++++++++++++++++++++++++++----- src/bin/pg_rewind/pg_rewind.c | 33 ++++++++++++++++++++++---------- src/bin/pg_rewind/pg_rewind.h | 2 +- 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 8f4b282c6b1..442515249f4 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -117,11 +117,15 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, } /* - * Reads one WAL record. Returns the end position of the record, without - * doing anything with the record itself. + * Find the last valid WAL record after the divergence point. + * + * Skips over records such as shutdown checkpoints and XLOG + * switch records, which otherwise could make pg_rewind think a + * rewind is required even when no real changes happened after failover. + * Returns the end position of the last meaningful record. */ XLogRecPtr -readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, +findLastValidRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand) { XLogRecord *record; @@ -129,6 +133,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, char *errormsg; XLogPageReadPrivate private; XLogRecPtr endptr; + uint8 info; private.tliIndex = tliIndex; private.restoreCommand = restoreCommand; @@ -138,16 +143,37 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, if (xlogreader == NULL) pg_fatal("out of memory while allocating a WAL reading processor"); + for (;;) + { + XLogBeginRead(xlogreader, ptr); + record = XLogReadRecord(xlogreader, &errormsg); + if (record == NULL) + { + if (errormsg) + pg_fatal("could not read WAL record at %X/%08X: %s", + LSN_FORMAT_ARGS(ptr), errormsg); + else + pg_fatal("could not read WAL record at %X/%08X", + LSN_FORMAT_ARGS(ptr)); + } + ptr = record->xl_prev; + info = record->xl_info & ~XLR_INFO_MASK; + if((info != XLOG_CHECKPOINT_SHUTDOWN) && (info != XLOG_SWITCH)) + { + break; + } + } + ptr = xlogreader->EndRecPtr; XLogBeginRead(xlogreader, ptr); record = XLogReadRecord(xlogreader, &errormsg); if (record == NULL) { if (errormsg) pg_fatal("could not read WAL record at %X/%08X: %s", - LSN_FORMAT_ARGS(ptr), errormsg); + LSN_FORMAT_ARGS(ptr), errormsg); else pg_fatal("could not read WAL record at %X/%08X", - LSN_FORMAT_ARGS(ptr)); + LSN_FORMAT_ARGS(ptr)); } endptr = xlogreader->EndRecPtr; diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 0c68dd4235e..c97789584d7 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -405,16 +405,29 @@ main(int argc, char **argv) /* - * Determine the end-of-WAL on the target. - * - * The WAL ends at the last shutdown checkpoint, or at - * minRecoveryPoint if it was a standby. (If we supported rewinding a - * server that was not shut down cleanly, we would need to replay - * until we reach the first invalid record, like crash recovery does.) - */ - - /* read the checkpoint record on the target to see where it ends. */ - chkptendrec = readOneRecord(datadir_target, + * Determine the effective end-of-WAL on the target. + * + * Previously, this was taken directly from the last shutdown checkpoint, + * or from minRecoveryPoint if the server was a standby. However, this + * approach can falsely indicate divergence: when the old primary is shut + * down after promoting a standby, the only WAL record generated on the + * old primary is a shutdown checkpoint. In such cases, both clusters have + * identical data, yet the presence of that extra checkpoint record makes + * pg_rewind believe the target WAL extends past the divergence point: + * + * if (target_wal_endrec > divergerec) + * rewind_needed = true; + * + * That sets rewind_needed = true even though no user data changes exist. + * + * To avoid this, we no longer treat a plain shutdown checkpoint + * as a meaningful record when determining end-of-WAL. We instead + * scan backward to the last valid WAL record *after* divergence, + * skipping over shutdown-only artifacts. This ensures rewind is only + * triggered if there are actual changes on the target after divergence. + */ + + chkptendrec = findLastValidRecord(datadir_target, ControlFile_target.checkPoint, targetNentries - 1, restore_command); diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index 9cea144d2b2..304c9cd5ca9 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -40,7 +40,7 @@ extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo, const char *restoreCommand); -extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, +extern XLogRecPtr findLastValidRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand); /* in pg_rewind.c */ -- 2.43.0