v2.0001-pg_rewind-use-UUIDs-to-detect-independent-same-TLI-p.patch
text/x-patch
Filename: v2.0001-pg_rewind-use-UUIDs-to-detect-independent-same-TLI-p.patch
Type: text/x-patch
Part: 1
From 16456473c61537c5f8c7689a6dac340be6b84c43 Mon Sep 17 00:00:00 2001
From: Mats Kindahl <mats@kindahl.net>
Date: Thu, 30 Apr 2026 07:05:36 +0200
Subject: pg_rewind: use UUIDs to detect independent same-TLI promotions
Two PostgreSQL standbys can independently promote to the same timeline
ID if their primary stopped before either had a chance to promote. In
that situation both clusters share a timeline history prefix that looks
identical to pg_rewind: same TLI numbers and same begin/end LSNs. The
existing same-TLI shortcut therefore treated the source as a valid
rewind target and skipped the rewind entirely, leaving the target's
diverged WAL intact.
Fix this by embedding a UUIDv7 value in the timeline history file entry
that is written at promotion time. Each promotion generates a fresh UUID,
so the entries written by two independent promotions to the same TLI carry
different UUIDs even though the TLI number and begin LSN are identical.
When loading the timeline history, pg_rewind uses these UUIDs in two
places:
1. findCommonAncestorTimeline traces both histories forward and checks
that the TLI, begin LSN and promotion UUID of each entry match. The first
mismatch signals independent promotions and ends the trace, so only the
entries before it are treated as common ancestry.
2. The same-TLI shortcut (source and target on the same current TLI)
compares the UUID stored in the last completed history entry and a
mismatch forces a full rewind instead of a no-op.
UUIDs are zero for clusters that predate this change, and the comparison
function treats a zero UUID on either side as "unknown / compatible", so
the new code is fully backward-compatible with old history files.
A new test in t/005_same_timeline.pl covers the same-TLI shortcut case:
two standbys independently promote to TLI 2, each with a distinct UUID.
---
src/backend/access/transam/timeline.c | 49 ++++++++-
src/backend/access/transam/xlog.c | 40 +++++++-
src/backend/utils/adt/uuid.c | 14 ++-
src/bin/pg_rewind/pg_rewind.c | 120 +++++++++++++++++++++--
src/bin/pg_rewind/t/005_same_timeline.pl | 87 ++++++++++++++++
src/bin/pg_rewind/timeline.c | 47 ++++++++-
src/include/access/timeline.h | 5 +-
src/include/access/xlog_internal.h | 1 +
src/include/utils/uuid.h | 10 +-
9 files changed, 349 insertions(+), 24 deletions(-)
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index 68e5f692d26..bc768efa8a6 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -42,6 +42,8 @@
#include "pgstat.h"
#include "storage/fd.h"
#include "utils/wait_event.h"
+#include "utils/fmgrprotos.h"
+#include "utils/uuid.h"
/*
* Copies all timeline history files with id's between 'begin' and 'end'
@@ -114,6 +116,7 @@ readTimeLineHistory(TimeLineID targetTLI)
entry = palloc_object(TimeLineHistoryEntry);
entry->tli = targetTLI;
entry->begin = entry->end = InvalidXLogRecPtr;
+ memset(&entry->tluuid, 0, sizeof(pg_uuid_t));
return list_make1(entry);
}
@@ -125,6 +128,7 @@ readTimeLineHistory(TimeLineID targetTLI)
prevend = InvalidXLogRecPtr;
for (;;)
{
+ char uuid_str[UUID_STR_LEN + 1] = {0};
char fline[MAXPGPATH];
char *res;
char *ptr;
@@ -155,7 +159,8 @@ readTimeLineHistory(TimeLineID targetTLI)
if (*ptr == '\0' || *ptr == '#')
continue;
- nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo);
+ nfields =
+ sscanf(fline, "%u\t%X/%08X\t%36s", &tli, &switchpoint_hi, &switchpoint_lo, uuid_str);
if (nfields < 1)
{
@@ -164,7 +169,7 @@ readTimeLineHistory(TimeLineID targetTLI)
(errmsg("syntax error in history file: %s", fline),
errhint("Expected a numeric timeline ID.")));
}
- if (nfields != 3)
+ if (nfields < 3)
ereport(FATAL,
(errmsg("syntax error in history file: %s", fline),
errhint("Expected a write-ahead log switchpoint location.")));
@@ -182,6 +187,23 @@ readTimeLineHistory(TimeLineID targetTLI)
entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo;
prevend = entry->end;
+ /*
+ * Parse the optional UUID field. Old history files have the
+ * reason string in field 4 instead. It is in theory possible for
+ * a reason string to start with a 36-character word, but none of
+ * the reasons currently written does, so a fourth field of exactly
+ * UUID_STR_LEN characters is taken to be a UUID. This lets us read
+ * both old and new history files.
+ *
+ * NOTE(review): only the length is checked here; uuid_in() reports
+ * an error if a 36-character field is not a valid UUID.
+ */
+ memset(&entry->tluuid, 0, sizeof(pg_uuid_t));
+ if (nfields == 4 && strlen(uuid_str) == UUID_STR_LEN)
+ {
+ Datum datum = DirectFunctionCall1(uuid_in, CStringGetDatum(uuid_str));
+ pg_uuid_t *up = DatumGetUUIDP(datum);
+
+ memcpy(&entry->tluuid, up, sizeof(pg_uuid_t));
+ }
+
/* Build list with newest item first */
result = lcons(entry, result);
@@ -203,6 +225,7 @@ readTimeLineHistory(TimeLineID targetTLI)
entry->tli = targetTLI;
entry->begin = prevend;
entry->end = InvalidXLogRecPtr;
+ memset(&entry->tluuid, 0, sizeof(pg_uuid_t));
result = lcons(entry, result);
@@ -294,21 +317,33 @@ findNewestTimeLine(TimeLineID startTLI)
*
* newTLI: ID of the new timeline
* parentTLI: ID of its immediate parent
+ * newTLUUID: UUID uniquely identifying this promotion instance
* switchpoint: WAL location where the system switched to the new timeline
* reason: human-readable explanation of why the timeline was switched
*
- * Currently this is only used at the end recovery, and so there are no locking
+ * The output file is named <newTLI>.history (e.g. 00000003.history). If two
+ * servers independently promote to the same timeline ID, their history files
+ * share the same name. In a shared WAL archive the second file to arrive
+ * silently overwrites the first. The newTLUUID written into the file content
+ * lets pg_rewind detect this collision: it fetches each server's history file
+ * directly from that server, compares the UUIDs for every shared TLI, and
+ * treats a UUID mismatch as evidence of independent promotion even when the
+ * TLI numbers agree.
+ *
+ * Currently this is only used at end of recovery, and so there are no locking
* considerations. But we should be just as tense as XLogFileInit to avoid
* emplacing a bogus file.
*/
void
writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
+ const pg_uuid_t *newTLUUID,
XLogRecPtr switchpoint, char *reason)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
char histfname[MAXFNAMELEN];
char buffer[BLCKSZ];
+ char *uuid_str;
int srcfd;
int fd;
int nbytes;
@@ -398,13 +433,19 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
*
* If we did have a parent file, insert an extra newline just in case the
* parent file failed to end with one.
+ *
+ * Format: <parentTLI>\t<switchpoint>\t<ThisTimeLineUUID>\t<reason>\n
*/
+ uuid_str = DatumGetCString(DirectFunctionCall1(uuid_out, UUIDPGetDatum(newTLUUID)));
+
snprintf(buffer, sizeof(buffer),
- "%s%u\t%X/%08X\t%s\n",
+ "%s%u\t%X/%08X\t%s\t%s\n",
(srcfd < 0) ? "" : "\n",
parentTLI,
LSN_FORMAT_ARGS(switchpoint),
+ uuid_str,
reason);
+ pfree(uuid_str);
nbytes = strlen(buffer);
errno = 0;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e39af79c03b..586d996c56f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -99,6 +99,7 @@
#include "storage/subsystems.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
+#include "utils/uuid.h"
#include "utils/guc_tables.h"
#include "utils/injection_point.h"
#include "utils/pgstat_internal.h"
@@ -515,6 +516,13 @@ typedef struct XLogCtlData
TimeLineID InsertTimeLineID;
TimeLineID PrevTimeLineID;
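+ /*
+ * UUID identifying the current promotion. It is generated in
+ * StartupXLOG() just before the timeline history file is written, and
+ * the same value is written into that file. Protected by info_lck.
+ *
+ * NOTE(review): embedding this UUID in the XLOG_END_OF_RECOVERY record
+ * is described as a later step, but nothing visible in this patch adds
+ * a UUID field to xl_end_of_recovery or reads this member back.
+ */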
+ pg_uuid_t ThisTimeLineUUID;
+
/*
* SharedRecoveryState indicates if we're still in crash or archive
* recovery. Protected by info_lck.
@@ -6377,6 +6385,9 @@ StartupXLOG(void)
newTLI = endOfRecoveryInfo->lastRecTLI;
if (ArchiveRecoveryRequested)
{
+ TimestampTz now = GetCurrentTimestamp();
+ pg_uuid_t uuid_buf;
+
newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
ereport(LOG,
(errmsg("selected new timeline ID: %u", newTLI)));
@@ -6407,8 +6418,27 @@ StartupXLOG(void)
* to the new timeline, and will try to connect to the new timeline.
* To minimize the window for that, try to do as little as possible
* between here and writing the end-of-recovery record.
+ *
+ * Generate a UUIDv7 that uniquely identifies this promotion. The
+ * same UUID is written into the history file and later into the
+ * XLOG_END_OF_RECOVERY record so that pg_rewind can distinguish two
+ * servers that independently promoted to the same timeline ID.
*/
+
+
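+ /*
+ * TimestampTz is in microseconds; the call below passes milliseconds
+ * plus the sub-millisecond remainder scaled to nanoseconds. The UUID is
+ * generated outside the spinlock because generating it is relatively
+ * expensive and could involve unexpected delays; only the copy into
+ * shared memory is done while holding info_lck.
+ *
+ * NOTE(review): TimestampTz counts from the PostgreSQL epoch
+ * (2000-01-01), while the first argument (unix_ts_ms) is a Unix-epoch
+ * value. 'now' should be shifted by
+ * (POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY * USECS_PER_SEC
+ * before it is split; otherwise the timestamp embedded in the UUID is
+ * about 30 years in the past.
+ */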
+ generate_uuidv7_r(&uuid_buf, (uint64) (now / 1000), (uint32) (now % 1000) * 1000);
+ SpinLockAcquire(&XLogCtl->info_lck);
+ memcpy(&XLogCtl->ThisTimeLineUUID, &uuid_buf, sizeof(pg_uuid_t));
+ SpinLockRelease(&XLogCtl->info_lck);
+
writeTimeLineHistory(newTLI, recoveryTargetTLI,
+ &uuid_buf,
EndOfLog, endOfRecoveryInfo->recoveryStopReason);
ereport(LOG,
@@ -9042,8 +9072,16 @@ xlog_redo(XLogReaderState *record)
{
xl_end_of_recovery xlrec;
TimeLineID replayTLI;
+ uint32 rec_len;
- memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+ /*
+ * Zero the struct first so that old records without UUID fields
+ * produce all-zero UUIDs, which pg_rewind treats as "unknown".
+ */
+ memset(&xlrec, 0, sizeof(xl_end_of_recovery));
+ rec_len = XLogRecGetDataLen(record);
+ memcpy(&xlrec, XLogRecGetData(record),
+ Min(rec_len, sizeof(xl_end_of_recovery)));
/*
* For Hot Standby, we could treat this like a Shutdown Checkpoint,
diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c
index 6ee3752ac78..8dc098d11e3 100644
--- a/src/backend/utils/adt/uuid.c
+++ b/src/backend/utils/adt/uuid.c
@@ -72,7 +72,7 @@ static bool uuid_abbrev_abort(int memtupcount, SortSupport ssup);
static Datum uuid_abbrev_convert(Datum original, SortSupport ssup);
static inline void uuid_set_version(pg_uuid_t *uuid, unsigned char version);
static inline int64 get_real_time_ns_ascending(void);
-static pg_uuid_t *generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms);
+pg_uuid_t *generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms);
Datum
uuid_in(PG_FUNCTION_ARGS)
@@ -581,6 +581,13 @@ get_real_time_ns_ascending(void)
return ns;
}
+/*
+ * Convenience wrapper around generate_uuidv7_r(): allocate UUID_LEN bytes
+ * with palloc, let generate_uuidv7_r() fill them in for the given timestamp,
+ * and return whatever generate_uuidv7_r() returns.
+ */
+pg_uuid_t *
+generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms)
+{
+	return generate_uuidv7_r((pg_uuid_t *) palloc(UUID_LEN),
+							 unix_ts_ms, sub_ms);
+}
+
/*
* Generate UUID version 7 per RFC 9562, with the given timestamp.
*
@@ -597,10 +604,9 @@ get_real_time_ns_ascending(void)
*
* NB: all numbers here are unsigned, unix_ts_ms cannot be negative per RFC.
*/
-static pg_uuid_t *
-generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms)
+pg_uuid_t *
+generate_uuidv7_r(pg_uuid_t *uuid, uint64 unix_ts_ms, uint32 sub_ms)
{
- pg_uuid_t *uuid = palloc(UUID_LEN);
uint32 increased_clock_precision;
/* Fill in time part */
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 9d745d4b25b..b34f62bf968 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -32,6 +32,19 @@
#include "rewind_source.h"
#include "storage/bufpage.h"
+/*
+ * Timeline histories of the source and target clusters, as filled in by
+ * matchAndFetchTimelines().  source/target are the entry arrays returned by
+ * getTimelineHistory(); sourceNentries/targetNentries are their lengths.
+ */
+typedef struct TimelineHistoriesData
+{
+ TimeLineHistoryEntry *source,
+ *target;
+ int sourceNentries,
+ targetNentries;
+} TimelineHistoriesData;
+
+/* Pointer form, used as the output parameter of matchAndFetchTimelines(). */
+typedef TimelineHistoriesData * TimelineHistories;
+
static void usage(const char *progname);
static void perform_rewind(filemap_t *filemap, rewind_source *source,
@@ -53,6 +66,9 @@ static void findCommonAncestorTimeline(TimeLineHistoryEntry *a_history,
TimeLineHistoryEntry *b_history,
int b_nentries,
XLogRecPtr *recptr, int *tliIndex);
+static inline bool matchingTimelineUUID(TimeLineHistoryEntry *a, TimeLineHistoryEntry *b);
+static bool matchAndFetchTimelines(TimeLineID source_tli, TimeLineID target_tli,
+ TimelineHistories timelineHistories);
static void ensureCleanShutdown(const char *argv0);
static void disconnect_atexit(void);
@@ -141,6 +157,7 @@ main(int argc, char **argv)
int c;
XLogRecPtr divergerec;
int lastcommontliIndex;
+ TimelineHistoriesData timelineHistories;
XLogRecPtr chkptrec;
TimeLineID chkpttli;
XLogRecPtr chkptredo;
@@ -372,10 +389,20 @@ main(int argc, char **argv)
*
* If both clusters are already on the same timeline, there's nothing to
* do.
+ *
+ * This also handles the case when two servers independently promoted to the
+ * same timeline ID: one crashed after writing the history file but before
+ * its EOR WAL record was distributed, so a second standby promoted
+ * independently. The history files produced by those two promotions carry
+ * different UUIDs.
+ *
+ * When the clusters are on different timelines we locate the fork point via
+ * findCommonAncestorTimeline.
*/
- if (target_tli == source_tli)
+ if (matchAndFetchTimelines(source_tli, target_tli, &timelineHistories))
{
pg_log_info("source and target cluster are on the same timeline");
+ pfree(timelineHistories.source);
rewind_needed = false;
target_wal_endrec = InvalidXLogRecPtr;
}
@@ -389,8 +416,10 @@ main(int argc, char **argv)
* Retrieve timelines for both source and target, and find the point
* where they diverged.
*/
- sourceHistory = getTimelineHistory(source_tli, true, &sourceNentries);
- targetHistory = getTimelineHistory(target_tli, false, &targetNentries);
+ targetHistory = timelineHistories.target;
+ targetNentries = timelineHistories.targetNentries;
+ sourceHistory = timelineHistories.source;
+ sourceNentries = timelineHistories.sourceNentries;
findCommonAncestorTimeline(sourceHistory, sourceNentries,
targetHistory, targetNentries,
@@ -874,7 +903,7 @@ getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
*/
if (tli == 1)
{
- history = pg_malloc_object(TimeLineHistoryEntry);
+ history = pg_malloc0_object(TimeLineHistoryEntry);
history->tli = tli;
history->begin = history->end = InvalidXLogRecPtr;
*nentries = 1;
@@ -920,6 +949,64 @@ getTimelineHistory(TimeLineID tli, bool is_source, int *nentries)
return history;
}
+/*
+ * Decide whether the promotion UUIDs of two history entries are compatible.
+ *
+ * An all-zero UUID stands for "unknown": the history file predates this
+ * fix, or the entry is synthetic.  If either entry's UUID is unknown the
+ * pair is reported as matching, so that pg_rewind degrades gracefully when
+ * rewinding against an old server.  Otherwise the two UUIDs must be equal
+ * byte for byte.
+ */
+static inline bool
+matchingTimelineUUID(TimeLineHistoryEntry *a, TimeLineHistoryEntry *b)
+{
+	static const pg_uuid_t unknown = {{0}};
+	const bool	a_known = memcmp(&a->tluuid, &unknown, UUID_LEN) != 0;
+	const bool	b_known = memcmp(&b->tluuid, &unknown, UUID_LEN) != 0;
+
+	if (a_known && b_known)
+		return memcmp(&a->tluuid, &b->tluuid, UUID_LEN) == 0;
+
+	return true;
+}
+
+/*
+ * Fetch the timeline histories of both clusters into tlh and report whether
+ * the clusters can be treated as being on the same timeline (no rewind
+ * needed).
+ *
+ * tlh is populated before any comparison is made, so on a false return the
+ * caller can hand tlh->source / tlh->target directly to
+ * findCommonAncestorTimeline().
+ *
+ * Returns false when the current TLIs differ.  When they are equal, a
+ * history with fewer than two entries (as getTimelineHistory() builds for
+ * TLI 1) has no promotion UUID to compare, so the result is true.  For any
+ * later TLI, the second-to-last entry of each history carries the UUID of
+ * the promotion that created the current TLI; the two are compared with
+ * matchingTimelineUUID(), which treats an all-zero UUID on either side (old
+ * history file or synthetic entry) as matching.
+ */
+static bool
+matchAndFetchTimelines(TimeLineID source_tli, TimeLineID target_tli, TimelineHistories tlh)
+{
+	tlh->source = getTimelineHistory(source_tli, true, &tlh->sourceNentries);
+	tlh->target = getTimelineHistory(target_tli, false, &tlh->targetNentries);
+
+	if (source_tli != target_tli)
+		return false;
+
+	/* Without a completed entry there is no promotion UUID to compare. */
+	if (tlh->sourceNentries < 2 || tlh->targetNentries < 2)
+		return true;
+
+	/* Same zero-is-unknown rule as findCommonAncestorTimeline() applies. */
+	return matchingTimelineUUID(&tlh->source[tlh->sourceNentries - 2],
+								&tlh->target[tlh->targetNentries - 2]);
+}
+
/*
* Determine the TLI of the last common timeline in the timeline history of
* two clusters. *tliIndex is set to the index of last common timeline in
@@ -936,17 +1023,30 @@ findCommonAncestorTimeline(TimeLineHistoryEntry *a_history, int a_nentries,
/*
* Trace the history forward, until we hit the timeline diverge. It may
- * still be possible that the source and target nodes used the same
- * timeline number in their history but with different start position
- * depending on the history files that each node has fetched in previous
- * recovery processes. Hence check the start position of the new timeline
- * as well and move down by one extra timeline entry if they do not match.
+ * still be possible that the source and target nodes used the same timeline
+ * number in their history but with different start position depending on
+ * the history files that each node has fetched in previous recovery
+ * processes. Hence check the start position of the new timeline as well and
+ * move down by one extra timeline entry if they do not match.
+ *
+ * We also compare timeline UUIDs when both sides carry one. Two servers
+ * that independently promoted to the same timeline ID produce history files
+ * with the same name (e.g. 00000003.history); in a shared WAL archive the
+ * second file silently overwrites the first. pg_rewind fetches each
+ * server's history file directly from that server, so it sees both UUIDs.
+ *
+ * The timeline UUID stored in history entry[i] is the UUID of the promotion
+ * that created entry[i+1], i.e. the UUID of TLI entry[i+1].tli. So to
+ * check whether entry[i] itself represents the same timeline on both sides
+ * we look at entry[i-1].tluuid (for i > 0). TLI 1 (i == 0) is always the
+ * same: it is the original timeline and has no promotion UUID.
*/
n = Min(a_nentries, b_nentries);
for (i = 0; i < n; i++)
{
if (a_history[i].tli != b_history[i].tli ||
- a_history[i].begin != b_history[i].begin)
+ a_history[i].begin != b_history[i].begin ||
+ (i > 0 && !matchingTimelineUUID(&a_history[i - 1], &b_history[i - 1])))
break;
}
diff --git a/src/bin/pg_rewind/t/005_same_timeline.pl b/src/bin/pg_rewind/t/005_same_timeline.pl
index 95a40c3b270..539d05f57a1 100644
--- a/src/bin/pg_rewind/t/005_same_timeline.pl
+++ b/src/bin/pg_rewind/t/005_same_timeline.pl
@@ -7,6 +7,8 @@
#
use strict;
use warnings FATAL => 'all';
+use File::Copy;
+use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;
@@ -21,4 +23,89 @@ RewindTest::create_standby();
RewindTest::run_pg_rewind('local');
RewindTest::clean_rewind_test();
+# Test that pg_rewind detects and handles two standbys that independently
+# promoted to the same timeline ID. Before the UUID-based divergence check,
+# pg_rewind's same-TLI shortcut would incorrectly skip the rewind in this
+# case, leaving the target's diverged WAL intact.
+#
+# origin (TLI 1)
+# |
+# +--- node_a (TLI 1) --promote--> TLI 2, UUID-A (target)
+# |
+# +--- node_b (TLI 1) --promote--> TLI 2, UUID-B (source)
+#
+# pg_rewind must detect the UUID mismatch and rewind node_a to match node_b.
+
+my $node_origin = PostgreSQL::Test::Cluster->new('origin');
+$node_origin->init(allows_streaming => 1);
+$node_origin->append_conf('postgresql.conf', "wal_keep_size = 320MB\n");
+$node_origin->start;
+
+$node_origin->safe_psql('postgres', "CREATE TABLE tbl (val text)");
+$node_origin->safe_psql('postgres', "INSERT INTO tbl VALUES ('initial')");
+$node_origin->safe_psql('postgres', 'CHECKPOINT');
+
+# Create node_a and node_b from separate backups of origin so that each
+# has its own data directory and will generate an independent UUID on promotion.
+my $node_a = PostgreSQL::Test::Cluster->new('node_a');
+$node_origin->backup('backup_a');
+$node_a->init_from_backup($node_origin, 'backup_a', has_streaming => 1);
+$node_a->set_standby_mode();
+$node_a->start;
+
+my $node_b = PostgreSQL::Test::Cluster->new('node_b');
+$node_origin->backup('backup_b');
+$node_b->init_from_backup($node_origin, 'backup_b', has_streaming => 1);
+$node_b->set_standby_mode();
+$node_b->start;
+
+# Wait for both standbys to catch up to origin, then stop origin. After
+# this point the two standbys are isolated and will promote independently.
+$node_origin->wait_for_catchup($node_a);
+$node_origin->wait_for_catchup($node_b);
+$node_origin->stop;
+
+# Promote both standbys. Each lands on TLI 2 but generates a distinct UUID,
+# so the resulting clusters are diverged even though they share a timeline ID.
+$node_a->promote;
+$node_b->promote;
+
+# Insert a divergent row on each so the rewind has visible work to do.
+$node_a->safe_psql('postgres', "INSERT INTO tbl VALUES ('in A')");
+$node_b->safe_psql('postgres', "INSERT INTO tbl VALUES ('in B')");
+
+# Stop both nodes; rewind node_a (target) from node_b (source) in local mode.
+$node_a->stop;
+$node_b->stop;
+
+my $node_a_pgdata = $node_a->data_dir;
+my $tmp_folder = PostgreSQL::Test::Utils::tempdir;
+copy("$node_a_pgdata/postgresql.conf",
+ "$tmp_folder/node_a-postgresql.conf.tmp");
+
+command_ok(
+ [
+ 'pg_rewind',
+ '--debug',
+ '--source-pgdata' => $node_b->data_dir,
+ '--target-pgdata' => $node_a_pgdata,
+ '--no-sync',
+ '--config-file' => "$tmp_folder/node_a-postgresql.conf.tmp",
+ ],
+ 'pg_rewind handles independent same-TLI promotion');
+
+move("$tmp_folder/node_a-postgresql.conf.tmp",
+ "$node_a_pgdata/postgresql.conf");
+
+# node_a should now mirror node_b: it has 'initial' and 'in B', not 'in A'.
+$node_a->start;
+my $result =
+ $node_a->safe_psql('postgres', "SELECT val FROM tbl ORDER BY val");
+is($result, "in B\ninitial",
+ 'rewound node has source data, not its own divergent data');
+
+$node_a->teardown_node;
+$node_b->teardown_node;
+$node_origin->teardown_node;
+
done_testing();
diff --git a/src/bin/pg_rewind/timeline.c b/src/bin/pg_rewind/timeline.c
index dda06eaa0bc..b6500606b27 100644
--- a/src/bin/pg_rewind/timeline.c
+++ b/src/bin/pg_rewind/timeline.c
@@ -9,9 +9,40 @@
*/
#include "postgres_fe.h"
+#include <ctype.h>
+#include <string.h>
+
#include "access/timeline.h"
#include "pg_rewind.h"
+/*
+ * Parse a UUID string in standard dashed form into a pg_uuid_t.
+ *
+ * Each of the UUID_LEN bytes must be written as two hex digits.  A '-' is
+ * accepted (but not required) after bytes 4, 6, 8 and 10, and nothing may
+ * follow the last byte.
+ *
+ * Returns true and stores the result in *uuid on success.  Returns false
+ * and leaves *uuid untouched if str is not a valid UUID string, so a caller
+ * that zeroes *uuid beforehand keeps the "unknown" value on failure even if
+ * it does not check the return value.
+ */
+static bool
+rewind_parse_uuid(const char *str, pg_uuid_t *uuid)
+{
+	pg_uuid_t	parsed;
+	const char *src = str;
+
+	for (int i = 0; i < UUID_LEN; i++)
+	{
+		char		buf[3];
+
+		/* a terminating NUL is not a hex digit, so short input stops here */
+		if (!isxdigit((unsigned char) src[0]) ||
+			!isxdigit((unsigned char) src[1]))
+			return false;
+		buf[0] = src[0];
+		buf[1] = src[1];
+		buf[2] = '\0';
+		parsed.data[i] = (unsigned char) strtoul(buf, NULL, 16);
+		src += 2;
+
+		/* skip the group separator that follows bytes 4, 6, 8 and 10 */
+		if (src[0] == '-' && (i == 3 || i == 5 || i == 7 || i == 9))
+			src++;
+	}
+
+	/* reject trailing garbage before touching the caller's buffer */
+	if (*src != '\0')
+		return false;
+
+	*uuid = parsed;
+	return true;
+}
+
/*
* This is copy-pasted from the backend readTimeLineHistory, modified to
* return a malloc'd array and to work without backend functions.
@@ -48,6 +79,7 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
uint32 switchpoint_hi;
uint32 switchpoint_lo;
int nfields;
+ char uuid_str[UUID_STR_LEN + 1] = {0};
fline = bufptr;
while (*bufptr && *bufptr != '\n')
@@ -66,7 +98,8 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
if (*ptr == '\0' || *ptr == '#')
continue;
- nfields = sscanf(fline, "%u\t%X/%08X", &tli, &switchpoint_hi, &switchpoint_lo);
+ nfields = sscanf(fline, "%u\t%X/%08X\t%36s", &tli, &switchpoint_hi,
+ &switchpoint_lo, uuid_str);
if (nfields < 1)
{
@@ -75,7 +108,7 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
pg_log_error_detail("Expected a numeric timeline ID.");
exit(1);
}
- if (nfields != 3)
+ if (nfields < 3)
{
pg_log_error("syntax error in history file: %s", fline);
pg_log_error_detail("Expected a write-ahead log switchpoint location.");
@@ -99,7 +132,14 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo;
prevend = entry->end;
- /* we ignore the remainder of each line */
+ /*
+ * Parse the optional UUID field. Old history files have the reason
+ * string in field 4; its first word is much shorter than UUID_STR_LEN
+ * so the length check safely distinguishes old from new format.
+ */
+ memset(&entry->tluuid, 0, sizeof(pg_uuid_t));
+ if (nfields == 4 && strlen(uuid_str) == UUID_STR_LEN)
+ rewind_parse_uuid(uuid_str, &entry->tluuid);
}
if (entries && targetTLI <= lasttli)
@@ -123,6 +163,7 @@ rewind_parseTimeLineHistory(char *buffer, TimeLineID targetTLI, int *nentries)
entry->tli = targetTLI;
entry->begin = prevend;
entry->end = InvalidXLogRecPtr;
+ memset(&entry->tluuid, 0, sizeof(pg_uuid_t));
*nentries = nlines;
return entries;
diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h
index 97f1d619c35..cdd642c94f0 100644
--- a/src/include/access/timeline.h
+++ b/src/include/access/timeline.h
@@ -13,6 +13,7 @@
#include "access/xlogdefs.h"
#include "nodes/pg_list.h"
+#include "utils/uuid.h"
/*
* A list of these structs describes the timeline history of the server. Each
@@ -22,9 +23,10 @@
* pointers of all the entries form a contiguous line from beginning of time
* to infinity.
*/
-typedef struct
+typedef struct TimeLineHistoryEntry
{
TimeLineID tli;
+ pg_uuid_t tluuid; /* from history file; zero if unknown */
XLogRecPtr begin; /* inclusive */
XLogRecPtr end; /* exclusive, InvalidXLogRecPtr means infinity */
} TimeLineHistoryEntry;
@@ -33,6 +35,7 @@ extern List *readTimeLineHistory(TimeLineID targetTLI);
extern bool existsTimeLineHistory(TimeLineID probeTLI);
extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
+ const pg_uuid_t *newTLUUID,
XLogRecPtr switchpoint, char *reason);
extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size);
extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 13ae3ad4fbb..8d5e374dfad 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -22,6 +22,7 @@
#include "access/xlogdefs.h"
#include "access/xlogreader.h"
#include "datatype/timestamp.h"
+#include "utils/uuid.h"
#include "lib/stringinfo.h"
#include "pgtime.h"
#include "storage/block.h"
diff --git a/src/include/utils/uuid.h b/src/include/utils/uuid.h
index 572d8cf4c36..784920c1f8e 100644
--- a/src/include/utils/uuid.h
+++ b/src/include/utils/uuid.h
@@ -17,12 +17,16 @@
/* uuid size in bytes */
#define UUID_LEN 16
+/* length of a UUID string (without null terminator): xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx */
+#define UUID_STR_LEN 36
+
typedef struct pg_uuid_t
{
unsigned char data[UUID_LEN];
} pg_uuid_t;
-/* fmgr interface macros */
+/* fmgr interface macros (backend only) */
+#ifndef FRONTEND
static inline Datum
UUIDPGetDatum(const pg_uuid_t *X)
{
@@ -38,5 +42,9 @@ DatumGetUUIDP(Datum X)
}
#define PG_GETARG_UUID_P(X) DatumGetUUIDP(PG_GETARG_DATUM(X))
+#endif /* !FRONTEND */
+
+extern pg_uuid_t *generate_uuidv7(uint64 unix_ts_ms, uint32 sub_ms);
+extern pg_uuid_t *generate_uuidv7_r(pg_uuid_t *uuid, uint64 unix_ts_ms, uint32 sub_ms);
#endif /* UUID_H */
--
2.43.0