[PATCH v36 05/17] Prevent orphan storage files after server crash

Kyotaro Horiguchi <horikyota.ntt@gmail.com>

From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
To:
Date: 2024-11-06T08:35:33Z
Lists: pgsql-hackers
When a server crashes during a transaction that creates tables, newly
created but unused storage files are not removed. This patch prevents
such orphan files by utilizing the UNDO log system for storage files.
---
 src/backend/access/heap/heapam_handler.c   |  22 +--
 src/backend/access/rmgrdesc/Makefile       |   1 +
 src/backend/access/rmgrdesc/smgrundodesc.c |  62 ++++++
 src/backend/access/rmgrdesc/undologdesc.c  |   2 +
 src/backend/access/transam/undolog.c       |   1 +
 src/backend/catalog/index.c                |   4 +-
 src/backend/catalog/storage.c              | 212 +++++++++++++++++++--
 src/backend/commands/sequence.c            |   4 +-
 src/backend/commands/tablecmds.c           |  19 +-
 src/backend/storage/buffer/bufmgr.c        |   4 +-
 src/backend/storage/file/reinit.c          |  92 +++++++++
 src/backend/storage/smgr/smgr.c            |   9 +
 src/include/access/rmgrlist.h              |   2 +-
 src/include/catalog/storage.h              |   2 +
 src/include/catalog/storage_ulog.h         |  48 +++++
 src/include/storage/reinit.h               |   4 +
 src/include/storage/smgr.h                 |   1 +
 src/test/recovery/t/013_crash_restart.pl   |  19 ++
 18 files changed, 465 insertions(+), 43 deletions(-)
 create mode 100644 src/backend/access/rmgrdesc/smgrundodesc.c
 create mode 100644 src/include/catalog/storage_ulog.h

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 53f572f384b..239442f0cb2 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -611,8 +611,7 @@ heapam_relation_set_new_filelocator(Relation rel,
 	{
 		Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
 			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
-		smgrcreate(srel, INIT_FORKNUM, false);
-		log_smgrcreate(newrlocator, INIT_FORKNUM);
+		RelationCreateFork(srel, INIT_FORKNUM, true, true);
 	}
 
 	smgrclose(srel);
@@ -656,16 +655,17 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
 	{
 		if (smgrexists(RelationGetSmgr(rel), forkNum))
 		{
-			smgrcreate(dstrel, forkNum, false);
-
-			/*
-			 * WAL log creation if the relation is persistent, or this is the
-			 * init fork of an unlogged relation.
-			 */
-			if (RelationIsPermanent(rel) ||
+			bool wal_log = RelationIsPermanent(rel) ||
 				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
-				 forkNum == INIT_FORKNUM))
-				log_smgrcreate(newrlocator, forkNum);
+				 forkNum == INIT_FORKNUM);
+
+			/*
+			 * Usually, we don't use UNDO log for FSM or VM forks, as their
+			 * creation is not transactional. However, we're currently copying
+			 * the entire relation in a transactional manner, which requires
+			 * after-crash cleanup.
+			 */
+			RelationCreateFork(dstrel, forkNum, wal_log, true);
 			RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
 								rel->rd_rel->relpersistence);
 		}
diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
index 542fd3d6a8e..fc4605bd30b 100644
--- a/src/backend/access/rmgrdesc/Makefile
+++ b/src/backend/access/rmgrdesc/Makefile
@@ -26,6 +26,7 @@ OBJS = \
 	rmgrdesc_utils.o \
 	seqdesc.o \
 	smgrdesc.o \
+	smgrundodesc.o \
 	spgdesc.o \
 	standbydesc.o \
 	tblspcdesc.o \
diff --git a/src/backend/access/rmgrdesc/smgrundodesc.c b/src/backend/access/rmgrdesc/smgrundodesc.c
new file mode 100644
index 00000000000..9939ef2b61d
--- /dev/null
+++ b/src/backend/access/rmgrdesc/smgrundodesc.c
@@ -0,0 +1,62 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrundodesc.c
+ *	  rmgr undolog descriptor routines for catalog/storage.c
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/rmgrdesc/smgrundodesc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "catalog/storage_ulog.h"
+#include "lib/stringinfo.h"
+
+/* Append a description of a storage undo-log record to buf. */
+void
+smgr_undodesc(StringInfo buf, UndoLogRecord *record)
+{
+	uint8		info = ULogRecGetInfo(record) & ~ULR_INFO_MASK;
+
+	if (info == ULOG_SMGR_CREATE)
+	{
+		ul_smgr_create *urec = (ul_smgr_create *) ULogRecGetData(record);
+
+		/* The locator fields are Oid-typed (unsigned), hence %u. */
+		appendStringInfo(buf, ": %u/%u/%u, fork %d, backend %d",
+						 urec->rlocator.spcOid, urec->rlocator.dbOid,
+						 urec->rlocator.relNumber,
+						 urec->forknum, urec->backend);
+	}
+	else if (info == ULOG_SMGR_PRESERVE)
+	{
+		ul_smgr_preserve *urec = (ul_smgr_preserve *) ULogRecGetData(record);
+
+		appendStringInfo(buf, ": %u/%u/%u, fork %d, backend %d",
+						 urec->rlocator.spcOid, urec->rlocator.dbOid,
+						 urec->rlocator.relNumber,
+						 urec->forknum, urec->backend);
+	}
+}
+
+/* Return the name of a storage undo-log record type, or NULL if unknown. */
+const char *
+smgr_undoidentify(uint8 info)
+{
+	const char *id = NULL;
+
+	switch (info & ~ULR_INFO_MASK)
+	{
+		case ULOG_SMGR_CREATE:
+			id = "SMGRCREATE";
+			break;
+		case ULOG_SMGR_PRESERVE:
+			id = "SMGRPRESERVE";
+			break;
+	}
+	return id;
+}
diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c
index e7559cdd33c..fa88705f99e 100644
--- a/src/backend/access/rmgrdesc/undologdesc.c
+++ b/src/backend/access/rmgrdesc/undologdesc.c
@@ -14,6 +14,8 @@
 #include "postgres.h"
 
 #include "access/undolog.h"
+#include "catalog/storage.h"
+#include "catalog/storage_ulog.h"
 
 typedef struct UndoDescData
 {
diff --git a/src/backend/access/transam/undolog.c b/src/backend/access/transam/undolog.c
index 196e02e652f..b2fdbfcd0f9 100644
--- a/src/backend/access/transam/undolog.c
+++ b/src/backend/access/transam/undolog.c
@@ -28,6 +28,7 @@
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "lib/dshash.h"
+#include "catalog/storage_ulog.h"
 #include "miscadmin.h"
 #include "storage/fd.h"
 #include "storage/procarray.h"
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 6976249e9e9..7613192e343 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3059,8 +3059,8 @@ index_build(Relation heapRelation,
 	if (indexRelation->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
 		!smgrexists(RelationGetSmgr(indexRelation), INIT_FORKNUM))
 	{
-		smgrcreate(RelationGetSmgr(indexRelation), INIT_FORKNUM, false);
-		log_smgrcreate(&indexRelation->rd_locator, INIT_FORKNUM);
+		RelationCreateFork(RelationGetSmgr(indexRelation),
+						   INIT_FORKNUM, true, true);
 		indexRelation->rd_indam->ambuildempty(indexRelation);
 	}
 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 5b22cf10990..d546d169d34 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -19,13 +19,16 @@
 
 #include "postgres.h"
 
+#include "access/undolog.h"
 #include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
 #include "catalog/storage.h"
+#include "catalog/storage_ulog.h"
 #include "catalog/storage_xlog.h"
+#include "common/hashfn_unstable.h"
 #include "miscadmin.h"
 #include "storage/bulk_write.h"
 #include "storage/freespace.h"
@@ -76,6 +79,14 @@ typedef struct PendingRelSync
 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
 static HTAB *pendingSyncHash = NULL;
 
+/* Storage for smgr_undo()/smgr_undoevent() */
+static RelFileLocator *rlocs = NULL;
+static int			   rlocs_cap = 0;
+static int			   rlocs_len = 0;
+
+/* local functions */
+static void ulog_smgrcreate(SMgrRelation srel, ForkNumber forkNum);
+static void ulog_smgrpreserve(RelFileLocator rloc, ForkNumber forkNum);
 
 /*
  * AddPendingSync
@@ -147,36 +158,54 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
 	}
 
 	srel = smgropen(rlocator, procNumber);
-	smgrcreate(srel, MAIN_FORKNUM, false);
 
-	if (needs_wal)
-		log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
+	RelationCreateFork(srel, MAIN_FORKNUM, needs_wal, register_delete);
 
-	/*
-	 * Add the relation to the list of stuff to delete at abort, if we are
-	 * asked to do so.
-	 */
-	if (register_delete)
+	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
+	{
+		Assert(procNumber == INVALID_PROC_NUMBER);
+		AddPendingSync(&rlocator);
+	}
+
+	return srel;
+}
+
+/*
+ * RelationCreateFork
+ *		Create physical storage for a fork of a relation.
+ *
+ * This function creates a relation fork in a transactional manner. When
+ * undo_log is true, the creation is UNDO-logged so that in case of transaction
+ * aborts or server crashes later on, the fork will be removed. If the caller
+ * plans to remove the fork in another way, it should pass false. Additionally,
+ * it is WAL-logged if wal_log is true.
+ */
+void
+RelationCreateFork(SMgrRelation srel, ForkNumber forkNum,
+				   bool wal_log, bool undo_log)
+{
+	/* Schedule the removal of this fork at abort if requested. */
+	if (undo_log)
 	{
 		PendingRelDelete *pending;
 
+		ulog_smgrcreate(srel, forkNum);
+
 		pending = (PendingRelDelete *)
 			MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
-		pending->rlocator = rlocator;
-		pending->procNumber = procNumber;
+		pending->rlocator = srel->smgr_rlocator.locator;
+		pending->procNumber = srel->smgr_rlocator.backend;
 		pending->atCommit = false;	/* delete if abort */
 		pending->nestLevel = GetCurrentTransactionNestLevel();
 		pending->next = pendingDeletes;
 		pendingDeletes = pending;
 	}
 
-	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
-	{
-		Assert(procNumber == INVALID_PROC_NUMBER);
-		AddPendingSync(&rlocator);
-	}
+	/* WAL-log this creation if requested. */
+	if (wal_log)
+		log_smgrcreate(&srel->smgr_rlocator.locator, forkNum);
 
-	return srel;
+	smgrcreate(srel, forkNum, false);
 }
 
 /*
@@ -198,6 +227,35 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
 	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
 }
 
+/*
+ * Perform UndoLogWrite of a ULOG_SMGR_CREATE record to UNDO log.
+ */
+static void
+ulog_smgrcreate(SMgrRelation srel, ForkNumber forkNum)
+{
+	ul_smgr_create ulrec;
+
+	ulrec.rlocator = srel->smgr_rlocator.locator;
+	ulrec.backend = srel->smgr_rlocator.backend;
+	ulrec.forknum = forkNum;
+	UndoLogWrite(RM_SMGR_ID, ULOG_SMGR_CREATE, &ulrec, sizeof(ulrec));
+}
+
+/*
+ * Perform UndoLogWrite of a ULOG_SMGR_PRESERVE record to UNDO log.
+ */
+static void
+ulog_smgrpreserve(RelFileLocator rloc, ForkNumber forkNum)
+{
+	ul_smgr_preserve ulrec;
+
+	Assert(forkNum == MAIN_FORKNUM);
+	ulrec.rlocator = rloc;
+	ulrec.backend = INVALID_PROC_NUMBER;
+	ulrec.forknum = forkNum;
+	UndoLogWrite(RM_SMGR_ID, ULOG_SMGR_PRESERVE, &ulrec, sizeof(ulrec));
+}
+
 /*
  * RelationDropStorage
  *		Schedule unlinking of physical storage at transaction commit.
@@ -253,6 +311,7 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
 	PendingRelDelete *pending;
 	PendingRelDelete *prev;
 	PendingRelDelete *next;
+	bool			  found = false;
 
 	prev = NULL;
 	for (pending = pendingDeletes; pending != NULL; pending = next)
@@ -261,6 +320,8 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
 		if (RelFileLocatorEquals(rlocator, pending->rlocator)
 			&& pending->atCommit == atCommit)
 		{
+			found = true;
+
 			/* unlink and delete list entry */
 			if (prev)
 				prev->next = next;
@@ -275,6 +336,9 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
 			prev = pending;
 		}
 	}
+
+	if (found)
+		ulog_smgrpreserve(rlocator, MAIN_FORKNUM);
 }
 
 /*
@@ -1077,3 +1141,119 @@ smgr_redo(XLogReaderState *record)
 	else
 		elog(PANIC, "smgr_redo: unknown op code %u", info);
 }
+
+/*
+ * smgr_undo -- undo-log callback for the storage rmgr.
+ *
+ * During ULOGCXT_CLEANUP, a ULOG_SMGR_CREATE record adds its locator to
+ * rlocs[] and a ULOG_SMGR_PRESERVE record drops matching entries from it.
+ * Commit, abort and prepared contexts need no work here.
+ */
+void
+smgr_undo(UndoLogRecord *record, ULogContext cxt, bool redo, bool crashed)
+{
+	uint8	info;
+
+	Assert(CritSectionCount == 0);
+
+	if (cxt == ULOGCXT_COMMIT || cxt == ULOGCXT_ABORT ||
+		cxt == ULOGCXT_PREPARED)
+		return;					/* nothing to do here */
+
+	if (cxt != ULOGCXT_CLEANUP)
+		elog(PANIC, "smgr_undo: unknown context code %u", cxt);
+
+	Assert(record);
+	info = record->ul_info & ~ULR_INFO_MASK;
+
+	if (info == ULOG_SMGR_CREATE)
+	{
+		ul_smgr_create *ulrec = (ul_smgr_create *) ULogRecGetData(record);
+
+		/* NOTE(review): other forks are UNDO-logged too; confirm this holds. */
+		Assert(ulrec->forknum == MAIN_FORKNUM);
+
+		/* Grow the array when full: 32 entries at first, then doubling. */
+		if (rlocs_len >= rlocs_cap)
+		{
+			bool		first = (rlocs_cap == 0);
+
+			rlocs_cap = first ? 32 : rlocs_cap * 2;
+			if (first)
+				rlocs = palloc(sizeof(RelFileLocator) * rlocs_cap);
+			else
+				rlocs = repalloc(rlocs, sizeof(RelFileLocator) * rlocs_cap);
+		}
+		rlocs[rlocs_len++] = ulrec->rlocator;
+	}
+	else if (info == ULOG_SMGR_PRESERVE)
+	{
+		ul_smgr_preserve *ulrec =
+			(ul_smgr_preserve *) ULogRecGetData(record);
+		int			nkept = 0;
+
+		/* Compact rlocs[] in place, keeping only non-matching entries. */
+		for (int i = 0 ; i < rlocs_len ; i++)
+		{
+			if (!RelFileLocatorEquals(ulrec->rlocator, rlocs[i]))
+				rlocs[nkept++] = rlocs[i];
+		}
+		rlocs_len = nkept;
+	}
+	else
+		elog(PANIC, "smgr_undo: unknown op code %d", info);
+}
+
+/*
+ * smgr_undoevent -- at ULOGEVENT_XACTEND, unlink the storage of every
+ * relation that smgr_undo() left in rlocs[], then release that list.
+ */
+void
+smgr_undoevent(ULogEvent event)
+{
+	if (event == ULOGEVENT_XACTEND)
+	{
+		/*
+		 * Since the MAIN fork was created in this transaction, rollback
+		 * should remove all forks of this relation.  Although we could
+		 * register an undo record individually for each fork, this may be
+		 * more complex because VM and FSM can be created non-transactionally
+		 * outside the transaction that created the MAIN fork.
+		 */
+		ForkNumber	forks[3] = {
+			MAIN_FORKNUM, VISIBILITYMAP_FORKNUM, FSM_FORKNUM
+		};
+		BlockNumber firstblocks[3] = {0};
+		int			nforks = lengthof(forks);
+
+		for (int i = 0 ; i < rlocs_len ; i++)
+		{
+			SMgrRelation reln;
+
+			/*
+			 * Drop buffers, then the files. This can be improved by using
+			 * smgrdounlinkall(), but currently I take the simpler way.
+			 */
+			reln = smgropen(rlocs[i], INVALID_PROC_NUMBER);
+			DropRelationBuffers(reln, forks, nforks, firstblocks);
+			for (int j = 0 ; j < nforks ; j++)
+				smgrunlink(reln, forks[j], true);
+
+			smgrclose(reln);
+		}
+
+		if (rlocs)
+		{
+			pfree(rlocs);
+			rlocs = NULL;
+			rlocs_cap = rlocs_len = 0;
+		}
+	}
+	else if (event == ULOGEVENT_CLEANUP_INIT ||
+			 event == ULOGEVENT_RECOVERY_END)
+	{
+		/* Nothing to do */
+	}
+	else
+		elog(PANIC, "smgr_undoevent: unknown event code %u", event);
+}
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 0188e8bbd5b..be6afc7df52 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -31,6 +31,7 @@
 #include "catalog/objectaccess.h"
 #include "catalog/pg_sequence.h"
 #include "catalog/pg_type.h"
+#include "catalog/storage.h"
 #include "catalog/storage_xlog.h"
 #include "commands/defrem.h"
 #include "commands/sequence.h"
@@ -344,8 +345,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple)
 		SMgrRelation srel;
 
 		srel = smgropen(rel->rd_locator, INVALID_PROC_NUMBER);
-		smgrcreate(srel, INIT_FORKNUM, false);
-		log_smgrcreate(&rel->rd_locator, INIT_FORKNUM);
+		RelationCreateFork(srel, INIT_FORKNUM, true, true);
 		fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM);
 		FlushRelationBuffers(rel);
 		smgrclose(srel);
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 49374782625..b5766989d8e 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -15965,16 +15965,17 @@ index_copy_data(Relation rel, RelFileLocator newrlocator)
 	{
 		if (smgrexists(RelationGetSmgr(rel), forkNum))
 		{
-			smgrcreate(dstrel, forkNum, false);
-
-			/*
-			 * WAL log creation if the relation is persistent, or this is the
-			 * init fork of an unlogged relation.
-			 */
-			if (RelationIsPermanent(rel) ||
+			bool wal_log = RelationIsPermanent(rel) ||
 				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
-				 forkNum == INIT_FORKNUM))
-				log_smgrcreate(&newrlocator, forkNum);
+				 forkNum == INIT_FORKNUM);
+
+			/*
+			 * Usually, we don't use UNDO log for FSM or VM forks, as their
+			 * creation is not transactional. However, we're currently copying
+			 * the entire relation in a transactional manner, which requires
+			 * after-crash cleanup.
+			 */
+			RelationCreateFork(dstrel, forkNum, wal_log, true);
 			RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
 								rel->rd_rel->relpersistence);
 		}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 2622221809c..1a9c794374f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4812,8 +4812,7 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
 	/*
 	 * Create and copy all forks of the relation.  During create database we
 	 * have a separate cleanup mechanism which deletes complete database
-	 * directory.  Therefore, each individual relation doesn't need to be
-	 * registered for cleanup.
+	 * directory. Therefore, do not issue an UNDO log for this relation.
 	 */
 	RelationCreateStorage(dst_rlocator, relpersistence, false);
 
@@ -4827,6 +4826,7 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
 	{
 		if (smgrexists(src_rel, forkNum))
 		{
+			/* Use smgrcreate() directly as no UNDO log is required. */
 			smgrcreate(dst_rel, forkNum, false);
 
 			/*
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
index 01e267abf9b..d3a42d3f566 100644
--- a/src/backend/storage/file/reinit.c
+++ b/src/backend/storage/file/reinit.c
@@ -34,6 +34,39 @@ typedef struct
 	RelFileNumber relnumber;	/* hash key */
 } unlogged_relation_entry;
 
+static char **ignore_files = NULL;
+static int nignore_elems = 0;
+static int nignore_files = 0;
+
+/*
+ * determine if the file should be ignored when resetting unlogged relations
+ *
+ * Matches "dirname/name" against ignore_files[], exactly or up to a '_'.
+ */
+static bool
+reinit_ignore_file(const char *dirname, const char *name)
+{
+	char fnamebuf[MAXPGPATH];
+	int len;
+
+	if (nignore_files == 0)
+		return false;
+
+	/* Build "dirname/name"; snprintf() bounds and terminates the result. */
+	snprintf(fnamebuf, sizeof(fnamebuf), "%s/%s", dirname, name);
+
+	for (int i = 0 ; i < nignore_files ; i++)
+	{
+		/* match ignoring fork part */
+		len = strlen(ignore_files[i]);
+		if (strncmp(fnamebuf, ignore_files[i], len) == 0 &&
+			(fnamebuf[len] == 0 || fnamebuf[len] == '_'))
+			return true;
+	}
+
+	return false;
+}
+
 /*
  * Reset unlogged relations from before the last restart.
  *
@@ -204,6 +237,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 													 &forkNum, &segno))
 				continue;
 
+			/* Skip anything that undo log suggested to ignore */
+			if (reinit_ignore_file(dbspacedirname, de->d_name))
+				continue;
+
 			/* Also skip it unless this is the init fork. */
 			if (forkNum != INIT_FORKNUM)
 				continue;
@@ -243,6 +280,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 													 &forkNum, &segno))
 				continue;
 
+			/* Skip anything that undo log suggested to ignore */
+			if (reinit_ignore_file(dbspacedirname, de->d_name))
+				continue;
+
 			/* We never remove the init fork. */
 			if (forkNum == INIT_FORKNUM)
 				continue;
@@ -294,6 +335,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 													 &forkNum, &segno))
 				continue;
 
+			/* Skip anything that undo log suggested to ignore */
+			if (reinit_ignore_file(dbspacedirname, de->d_name))
+				continue;
+
 			/* Also skip it unless this is the init fork. */
 			if (forkNum != INIT_FORKNUM)
 				continue;
@@ -337,6 +382,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 													 &forkNum, &segno))
 				continue;
 
+			/* Skip anything that undo log suggested to ignore */
+			if (reinit_ignore_file(dbspacedirname, de->d_name))
+				continue;
+
 			/* Also skip it unless this is the init fork. */
 			if (forkNum != INIT_FORKNUM)
 				continue;
@@ -366,6 +415,49 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
 	}
 }
 
+/*
+ * Record a relation, by the path of its MAIN fork, that should be left alone
+ * while reinitializing unlogged relations.
+ */
+void
+ResetUnloggedRelationIgnore(RelFileLocator rloc, ProcNumber backend)
+{
+	RelFileLocatorBackend rbloc;
+
+	if (nignore_files >= nignore_elems)
+	{
+		if (ignore_files == NULL)
+		{
+			nignore_elems = 16;
+			ignore_files = palloc(sizeof(char *) * nignore_elems);
+		}
+		else
+		{
+			nignore_elems *= 2;
+			ignore_files = repalloc(ignore_files,
+									sizeof(char *) * nignore_elems);
+		}
+	}
+
+	rbloc.backend = backend;
+	rbloc.locator = rloc;
+	ignore_files[nignore_files++] = relpath(rbloc, MAIN_FORKNUM);
+}
+
+/* Clear the ignore list, releasing each recorded path and the array. */
+void
+ResetUnloggedRelationIgnoreClear(void)
+{
+	if (nignore_elems == 0)
+		return;
+
+	for (int i = 0 ; i < nignore_files ; i++)
+		pfree(ignore_files[i]);
+	pfree(ignore_files);
+	ignore_files = NULL;
+	/* Reset the count too, or later lookups would index a NULL array. */
+	nignore_elems = nignore_files = 0;
+}
+
 /*
  * Basic parsing of putative relation filenames.
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 36ad34aa6ac..8a7654118fe 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -819,6 +819,15 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
 	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
 }
 
+/*
+ * smgrunlink() -- unlink the given fork via the relation's storage manager
+ */
+void
+smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+	smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rlocator, forknum, isRedo);
+}
+
 /*
  * AtEOXact_SMgr
  *
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 5909d87d599..b0c4e689950 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -27,7 +27,7 @@
 /* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode, undo, undo_desc, undo_identify, undo_event */
 PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode, NULL, NULL, NULL, NULL)
 PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode, NULL, NULL, NULL, NULL)
-PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, smgr_undo, smgr_undodesc, smgr_undoidentify, smgr_undoevent)
 PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
 PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 72ef3ee92c0..3451d6ac80c 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -25,6 +25,8 @@ extern PGDLLIMPORT int wal_skip_threshold;
 extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator,
 										  char relpersistence,
 										  bool register_delete);
+extern void RelationCreateFork(SMgrRelation srel, ForkNumber forkNum,
+							   bool wal_log, bool undo_log);
 extern void RelationDropStorage(Relation rel);
 extern void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit);
 extern void RelationPreTruncate(Relation rel);
diff --git a/src/include/catalog/storage_ulog.h b/src/include/catalog/storage_ulog.h
new file mode 100644
index 00000000000..9568ab24cfb
--- /dev/null
+++ b/src/include/catalog/storage_ulog.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * storage_ulog.h
+ *	  prototypes for Undo Log support for backend/catalog/storage.c
+ *
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/storage_ulog.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef STORAGE_ULOG_H
+#define STORAGE_ULOG_H
+
+#include "access/undolog.h"
+#include "storage/smgr.h"
+
+/* ULOG gives us high 4 bits (just following xlog) */
+#define ULOG_SMGR_CREATE			0x10
+#define ULOG_SMGR_PRESERVE			0x20
+
+/* undo log entry for storage file creation */
+typedef struct ul_smgr_create
+{
+	RelFileLocator	rlocator;
+	ProcNumber		backend;
+	ForkNumber		forknum;
+} ul_smgr_create;
+
+typedef struct ul_smgr_preserve
+{
+	RelFileLocator	rlocator;
+	ProcNumber		backend;
+	ForkNumber		forknum;
+} ul_smgr_preserve;
+
+extern void smgr_undo(UndoLogRecord *record, ULogContext cxt, bool redo,
+					  bool crashed);
+extern void	smgr_undodesc(StringInfo buf, UndoLogRecord *record);
+extern const char *smgr_undoidentify(uint8 info);
+extern void smgr_undoevent(ULogEvent event);
+
+#define ULogRecGetData(record) ((char *) (record) + sizeof(UndoLogRecord))
+#define ULogRecGetInfo(record) ((record)->ul_info)
+
+#endif							/* STORAGE_ULOG_H */
diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h
index 1373d509df2..02bf55d3a6b 100644
--- a/src/include/storage/reinit.h
+++ b/src/include/storage/reinit.h
@@ -16,9 +16,13 @@
 #define REINIT_H
 
 #include "common/relpath.h"
+#include "storage/relfilelocator.h"
 
 
 extern void ResetUnloggedRelations(int op);
+extern void ResetUnloggedRelationIgnore(RelFileLocator rloc,
+										ProcNumber backend);
+extern void ResetUnloggedRelationIgnoreClear(void);
 extern bool parse_filename_for_nontemp_relation(const char *name,
 												RelFileNumber *relnumber,
 												ForkNumber *fork,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 63a186bd346..a2c15d6af90 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -110,6 +110,7 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
 						 BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum);
+extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void AtEOXact_SMgr(void);
 extern bool ProcessBarrierSmgrRelease(void);
 
diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl
index d5d24e31d90..4df88efeb3d 100644
--- a/src/test/recovery/t/013_crash_restart.pl
+++ b/src/test/recovery/t/013_crash_restart.pl
@@ -86,6 +86,23 @@ ok( pump_until(
 $killme_stdout = '';
 $killme_stderr = '';
 
+# Also, create a table whose storage should *not* survive.
+$killme_stdin .= q[
+CREATE TABLE should_not_survive (a int);
+SELECT pg_relation_filepath('should_not_survive');
+];
+ok( pump_until(
+		$killme, $psql_timeout, \$killme_stdout,
+		qr/base\/[[:digit:]\/]+[\r\n]$/m),
+	'created a table');
+my $relfilerelpath = $killme_stdout;
+chomp($relfilerelpath);
+$killme_stdout = '';
+$killme_stderr = '';
+
+my $relfilepath = $node->data_dir . "/" . $relfilerelpath;
+ok( -e $relfilepath,
+	"storage file is created in xact that is going to crash");
 
 # Start longrunning query in second session; its failure will signal that
 # crash-restart has occurred.  The initial wait for the trivial select is to
@@ -144,6 +161,8 @@ $killme->run();
 ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
 $monitor->run();
 
+ok( ! -e $relfilepath,
+	"orphaned storage file is correctly removed");
 
 # Acquire pid of new backend
 $killme_stdin .= q[
-- 
2.43.5


----Next_Part(Fri_Dec_27_17_25_02_2024_357)--
Content-Type: Text/X-Patch; charset=us-ascii
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename="v36-0006-new-indexam-bit-for-unlogged-storage-compatibili.patch"