v3-0003-Handle-rewind-failure-when-a-timeline-ends-with-an-overwritten-contrecord.patch
text/x-patch
Filename: v3-0003-Handle-rewind-failure-when-a-timeline-ends-with-an-overwritten-contrecord.patch
Type: text/x-patch
Part: 2
From a189e0d922c19a23739a4508b0e722c7459e872d Mon Sep 17 00:00:00 2001
From: Alena Vinter <dlaaren8@gmail.com>
Date: Wed, 10 Sep 2025 14:06:09 +0700
Subject: [PATCH 3/3] Handle rewind failure when a timeline ends with an
overwritten contrecord.
When the last common timeline ends with an overwritten contrecord, the
divergence point may not point to the start of a valid WAL record on the
target, which causes read errors and makes the rewind impossible.
To handle this case, when the last common timeline is still unfinished on
the target, start the backward search for a checkpoint preceding the
divergence point from the target's last checkpoint rather than from the
divergence point itself. This ensures that the search always begins at a
known-valid position in WAL.
---
src/bin/pg_rewind/parsexlog.c | 25 +++++++++++++------------
src/bin/pg_rewind/pg_rewind.c | 18 ++++++++++++++++--
src/bin/pg_rewind/pg_rewind.h | 7 ++++---
3 files changed, 33 insertions(+), 17 deletions(-)
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 8f4b282c6b1..7d01fefb1d5 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -165,9 +165,10 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex,
* Find the previous checkpoint preceding given WAL location.
*/
void
-findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
- XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
- XLogRecPtr *lastchkptredo, const char *restoreCommand)
+findLastCheckpoint(const char *datadir, XLogRecPtr startptr, XLogRecPtr forkptr,
+ int tliIndex, XLogRecPtr *lastchkptrec,
+ TimeLineID *lastchkpttli, XLogRecPtr *lastchkptredo,
+ const char *restoreCommand)
{
/* Walk backwards, starting from the given record */
XLogRecord *record;
@@ -179,17 +180,17 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
TimeLineID current_tli = 0;
/*
- * The given fork pointer points to the end of the last common record,
- * which is not necessarily the beginning of the next record, if the
- * previous record happens to end at a page boundary. Skip over the page
- * header in that case to find the next record.
+ * The given start pointer may be the end of the last common record, which
+ * is not necessarily the beginning of the next record if the previous
+ * record happens to end at a page boundary. Skip over the page header in
+ * that case to find the next record.
*/
- if (forkptr % XLOG_BLCKSZ == 0)
+ if (startptr % XLOG_BLCKSZ == 0)
{
- if (XLogSegmentOffset(forkptr, WalSegSz) == 0)
- forkptr += SizeOfXLogLongPHD;
+ if (XLogSegmentOffset(startptr, WalSegSz) == 0)
+ startptr += SizeOfXLogLongPHD;
else
- forkptr += SizeOfXLogShortPHD;
+ startptr += SizeOfXLogShortPHD;
}
private.tliIndex = tliIndex;
@@ -200,7 +201,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex,
if (xlogreader == NULL)
pg_fatal("out of memory while allocating a WAL reading processor");
- searchptr = forkptr;
+ searchptr = startptr;
for (;;)
{
uint8 info;
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e..c2ecf72cacf 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -140,6 +140,7 @@ main(int argc, char **argv)
int option_index;
int c;
XLogRecPtr divergerec;
+ XLogRecPtr checkpoint_search_startrec;
int lastcommontliIndex;
XLogRecPtr chkptrec;
TimeLineID chkpttli;
@@ -459,8 +460,21 @@ main(int argc, char **argv)
/* Initialize hashtable that tracks WAL files protected from removal */
keepwal_init();
- findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex,
- &chkptrec, &chkpttli, &chkptredo, restore_command);
+ /*
+ * If the last common timeline is still unfinished on the target, the
+ * divergence point taken from the source's finished timeline may not be
+ * the start of a valid record in the target's WAL. In that case, start the
+ * backward search for a checkpoint preceding the divergence point from the
+ * target's last checkpoint, which is a known-valid WAL position.
+ */
+ if (targetHistory[lastcommontliIndex].end == InvalidXLogRecPtr)
+ checkpoint_search_startrec = ControlFile_target.checkPoint;
+ else
+ checkpoint_search_startrec = divergerec;
+
+ findLastCheckpoint(datadir_target, checkpoint_search_startrec, divergerec,
+ lastcommontliIndex, &chkptrec, &chkpttli, &chkptredo,
+ restore_command);
pg_log_info("rewinding from last common checkpoint at %X/%08X on timeline %u",
LSN_FORMAT_ARGS(chkptrec), chkpttli);
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 9cea144d2b2..4879be1d1d4 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -35,9 +35,10 @@ extern uint64 fetch_done;
extern void extractPageMap(const char *datadir, XLogRecPtr startpoint,
int tliIndex, XLogRecPtr endpoint,
const char *restoreCommand);
-extern void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr,
- int tliIndex,
- XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli,
+extern void findLastCheckpoint(const char *datadir, XLogRecPtr startptr,
+ XLogRecPtr forkptr, int tliIndex,
+ XLogRecPtr *lastchkptrec,
+ TimeLineID *lastchkpttli,
XLogRecPtr *lastchkptredo,
const char *restoreCommand);
extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr,
--
2.51.0