v20240111-0007-decode-all-sequence-relfilenodes.patch

text/x-patch

Filename: v20240111-0007-decode-all-sequence-relfilenodes.patch
Type: text/x-patch
Part: 6
Message: Re: logical decoding and replication of sequences, take 2

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v20240111-0007
Subject: decode all sequence relfilenodes
File+
src/backend/replication/logical/decode.c 6 25
src/backend/replication/logical/reorderbuffer.c 23 3
From 8f1598ac7795d70a9baecc3492b2930dcd828f35 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 11 Jan 2024 15:48:44 +0100
Subject: [PATCH v20240111 7/7] decode all sequence relfilenodes

---
 src/backend/replication/logical/decode.c      | 31 ++++---------------
 .../replication/logical/reorderbuffer.c       | 26 ++++++++++++++--
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 95f224df4a7..a141f444772 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -1432,27 +1432,12 @@ seq_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 	 * change as transactional and queue it. Otherwise it needs to be treated
 	 * as non-transactional, in which case we just send it to the plugin right
 	 * away.
-	 *
-	 * XXX This may incorrectly identify some changes as non-transactional,
-	 * if the relfilenode was created before the snapshot switched to FULL
-	 * (so that smgr_decode skipped it, and it's not in the hash table).
-	 * But we still handle this correctly, because the snapshot can't be
-	 * in CONSISTENT state yet, so the following check will skip the change
-	 * too, and we won't try to apply it (which would fail due to unknown
-	 * relfilenode).
 	 */
 	transactional = ReorderBufferSequenceIsTransactional(ctx->reorder,
 														 target_locator,
 														 NULL);
 
-	/*
-	 * Skip the change if already processed (per the snapshot).
-	 *
-	 * XXX It's important to skip all non-transactional changes with snapshot
-	 * that is not consistent yet. The sequence change may be incorrectly
-	 * identified as non-transactional (see above), and we must not attempt
-	 * to process it.
-	 */
+	/* Skip the change if already processed (per the snapshot). */
 	if (transactional &&
 		!SnapBuildProcessChange(builder, xid, buf->origptr))
 		return;
@@ -1516,11 +1501,15 @@ seq_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
  * We only care about XLOG_SMGR_CREATE records for "our" database (logical
  * decoding is restricted to a single database), and we do the filtering
  * and skipping, as appropriate.
+ *
+ * Note that the relfilenode is decoded and added to the reorder buffer even
+ * before the snapshot switches to SNAPBUILD_FULL_SNAPSHOT. This is necessary
+ * so that seq_decode can correctly identify transactional changes when the
+ * state changes to FULL after the relfilenode is created.
  */
 void
 smgr_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 {
-	SnapBuild  *builder = ctx->snapshot_builder;
 	XLogReaderState *r = buf->record;
 	TransactionId xid = XLogRecGetXid(r);
 	uint8		info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK;
@@ -1542,14 +1531,6 @@ smgr_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 
 	ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr);
 
-	/*
-	 * If we don't have snapshot or we are just fast-forwarding, there is no
-	 * point in decoding relfilenode information.
-	 */
-	if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT ||
-		ctx->fast_forward)
-		return;
-
 	/* only interested in our database */
 	xlrec = (xl_smgr_create *) XLogRecGetData(r);
 	if (xlrec->rlocator.dbOid != ctx->slot->data.database)
diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c
index acaa6dce8e4..017fd9c09d7 100644
--- a/src/backend/replication/logical/reorderbuffer.c
+++ b/src/backend/replication/logical/reorderbuffer.c
@@ -1280,10 +1280,30 @@ ReorderBufferAddRelFileLocator(ReorderBuffer *rb, TransactionId xid,
 						&found);
 
 	/*
-	 * We've just decoded creation of the relfilenode, so if we found it
-	 * in the hash table, something is wrong.
+	 * We've just decoded creation of the relfilenode, but it's possible a
+	 * the transaction aborted without triggering a reorderbuffer cleanup,
+	 * and the relfilenode was allocated again. While quite unlikely (the
+	 * hash table entry should be cleaned when processing RUNNING_XACTS,
+	 * and we should not reuse the relfilenode this fast).
+	 *
+	 * So if we find an entry for the relfilenode, we simply point it to
+	 * the new transaction, but we need to remove it from the hash table
+	 * in the original transaction (otherwise it'd confuse cleanup).
 	 */
-	Assert(!found);
+	if (found)
+	{
+		/*
+		 * The entry must be pointing to a transaction, and there must be
+		 * must be a hash table of relfilenodes.
+		 */
+		Assert(entry->txn);
+		Assert(entry->txn->sequences_hash);
+
+		if (hash_search(entry->txn->sequences_hash,
+						(void *) &entry->rlocator,
+						HASH_REMOVE, NULL) == NULL)
+			elog(ERROR, "hash table of sequence relfilenode corrupted");
+	}
 
 	entry->txn = txn;
 
-- 
2.43.0