fix_some_problems_about_cascading_replication_v1.patch
text/x-patch
Filename: fix_some_problems_about_cascading_replication_v1.patch
Type: text/x-patch
Part: 0
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index c0a32a3..2ec39dd 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2328,7 +2328,7 @@ reaper(SIGNAL_ARGS)
* XXX should avoid the need for disconnection. When we do,
* am_cascading_walsender should be replaced with RecoveryInProgress()
*/
- if (max_wal_senders > 0)
+ if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0)
{
ereport(LOG,
(errmsg("terminating all walsender processes to force cascaded standby(s) to update timeline and reconnect")));
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 0eadf64..ef6894c 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -368,6 +368,22 @@ StartReplication(StartReplicationCmd *cmd)
SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);
/*
+ * When promoting a cascading standby, postmaster sends SIGUSR2 to
+ * any cascading walsenders to kill them. But there is a corner-case where
+ * such a walsender fails to receive SIGUSR2 and unexpectedly survives the
+ * standby promotion. This happens when postmaster sends SIGUSR2 before
+ * the walsender marks itself as a WAL sender, because postmaster sends
+ * SIGUSR2 only to the processes marked as WAL senders.
+ *
+ * To avoid this corner-case, if recovery is NOT in progress even though
+ * the walsender is a cascading one, we do the same thing as the SIGUSR2
+ * signal handler does, i.e., set walsender_ready_to_stop to true, which
+ * causes the walsender to end later.
+ */
+ if (am_cascading_walsender && !RecoveryInProgress())
+ walsender_ready_to_stop = true;
+
+ /*
* We assume here that we're logging enough information in the WAL for
* log-shipping, since this is checked in PostmasterMain().
*