Thread

  1. [PATCH v36 05/17] Prevent orphan storage files after server crash

    Kyotaro Horiguchi <horikyota.ntt@gmail.com> — 2024-11-06T08:35:33Z

    When a server crashes during a transaction that creates tables, newly
    created but unused storage files are not removed. This patch prevents
    such orphan files by utilizing the UNDO log system for storage files.
    ---
     src/backend/access/heap/heapam_handler.c   |  22 +--
     src/backend/access/rmgrdesc/Makefile       |   1 +
     src/backend/access/rmgrdesc/smgrundodesc.c |  62 ++++++
     src/backend/access/rmgrdesc/undologdesc.c  |   2 +
     src/backend/access/transam/undolog.c       |   1 +
     src/backend/catalog/index.c                |   4 +-
     src/backend/catalog/storage.c              | 212 +++++++++++++++++++--
     src/backend/commands/sequence.c            |   4 +-
     src/backend/commands/tablecmds.c           |  19 +-
     src/backend/storage/buffer/bufmgr.c        |   4 +-
     src/backend/storage/file/reinit.c          |  92 +++++++++
     src/backend/storage/smgr/smgr.c            |   9 +
     src/include/access/rmgrlist.h              |   2 +-
     src/include/catalog/storage.h              |   2 +
     src/include/catalog/storage_ulog.h         |  48 +++++
     src/include/storage/reinit.h               |   4 +
     src/include/storage/smgr.h                 |   1 +
     src/test/recovery/t/013_crash_restart.pl   |  19 ++
     18 files changed, 465 insertions(+), 43 deletions(-)
     create mode 100644 src/backend/access/rmgrdesc/smgrundodesc.c
     create mode 100644 src/include/catalog/storage_ulog.h
    
    diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
    index 53f572f384b..239442f0cb2 100644
    --- a/src/backend/access/heap/heapam_handler.c
    +++ b/src/backend/access/heap/heapam_handler.c
    @@ -611,8 +611,7 @@ heapam_relation_set_new_filelocator(Relation rel,
     	{
     		Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
     			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
    -		smgrcreate(srel, INIT_FORKNUM, false);
    -		log_smgrcreate(newrlocator, INIT_FORKNUM);
    +		RelationCreateFork(srel, INIT_FORKNUM, true, true);
     	}
     
     	smgrclose(srel);
    @@ -656,16 +655,17 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
     	{
     		if (smgrexists(RelationGetSmgr(rel), forkNum))
     		{
    -			smgrcreate(dstrel, forkNum, false);
    -
    -			/*
    -			 * WAL log creation if the relation is persistent, or this is the
    -			 * init fork of an unlogged relation.
    -			 */
    -			if (RelationIsPermanent(rel) ||
    +			bool wal_log = RelationIsPermanent(rel) |
     				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
    -				 forkNum == INIT_FORKNUM))
    -				log_smgrcreate(newrlocator, forkNum);
    +				 forkNum == INIT_FORKNUM);
    +
    +			/*
    +			 * Usually, we don't use UNDO log for FSM or VM forks, as their
    +			 * creation is not transactional. However, we're currently copying
    +			 * the entire relation in a transactional manner, which requires
    +			 * after-crash cleanup.
    +			 */
    +			RelationCreateFork(dstrel, forkNum, wal_log, true);
     			RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
     								rel->rd_rel->relpersistence);
     		}
    diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile
    index 542fd3d6a8e..fc4605bd30b 100644
    --- a/src/backend/access/rmgrdesc/Makefile
    +++ b/src/backend/access/rmgrdesc/Makefile
    @@ -26,6 +26,7 @@ OBJS = \
     	rmgrdesc_utils.o \
     	seqdesc.o \
     	smgrdesc.o \
    +	smgrundodesc.o \
     	spgdesc.o \
     	standbydesc.o \
     	tblspcdesc.o \
    diff --git a/src/backend/access/rmgrdesc/smgrundodesc.c b/src/backend/access/rmgrdesc/smgrundodesc.c
    new file mode 100644
    index 00000000000..9939ef2b61d
    --- /dev/null
    +++ b/src/backend/access/rmgrdesc/smgrundodesc.c
    @@ -0,0 +1,62 @@
    +/*-------------------------------------------------------------------------
    + *
    + * smgrundodesc.c
    + *	  rmgr undolog descriptor routines for catalog/storage.c
    + *
    + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
    + * Portions Copyright (c) 1994, Regents of the University of California
    + *
    + *
    + * IDENTIFICATION
    + *	  src/backend/access/rmgrdesc/smgrundodesc.c
    + *
    + *-------------------------------------------------------------------------
    + */
    +#include "postgres.h"
    +#include "catalog/storage_ulog.h"
    +#include "lib/stringinfo.h"
    +
    +/*
    + * smgr_undodesc
    + *		Append a textual description of a storage UNDO log record to buf.
    + *
    + * Both ULOG_SMGR_CREATE and ULOG_SMGR_PRESERVE records are printed as
    + * "tablespace/database/relnumber, fork, backend".  Records with any other
    + * info code append nothing.
    + */
    +void
    +smgr_undodesc(StringInfo buf, UndoLogRecord *record)
    +{
    +	/* Use the UNDO-log info mask, the same one smgr_undo() decodes with. */
    +	uint8		info = ULogRecGetInfo(record) & ~ULR_INFO_MASK;
    +
    +	if (info == ULOG_SMGR_CREATE)
    +	{
    +		ul_smgr_create *urec = (ul_smgr_create *) ULogRecGetData(record);
    +
    +		/* The locator members are unsigned OIDs, hence %u. */
    +		appendStringInfo(buf, ": %u/%u/%u, fork %d, backend %d",
    +						 urec->rlocator.spcOid,
    +						 urec->rlocator.dbOid,
    +						 urec->rlocator.relNumber,
    +						 urec->forknum, urec->backend);
    +	}
    +	else if (info == ULOG_SMGR_PRESERVE)
    +	{
    +		ul_smgr_preserve *urec = (ul_smgr_preserve *) ULogRecGetData(record);
    +
    +		appendStringInfo(buf, ": %u/%u/%u, fork %d, backend %d",
    +						 urec->rlocator.spcOid,
    +						 urec->rlocator.dbOid,
    +						 urec->rlocator.relNumber,
    +						 urec->forknum, urec->backend);
    +	}
    +}
    +
    +/*
    + * smgr_undoidentify
    + *		Return the symbolic name of a storage UNDO log record type.
    + *
    + * Returns NULL when the info code is not one of the record types known to
    + * this resource manager.
    + */
    +const char *
    +smgr_undoidentify(uint8 info)
    +{
    +	const char *id = NULL;
    +
    +	/* Use the UNDO-log info mask, the same one smgr_undo() decodes with. */
    +	switch (info & ~ULR_INFO_MASK)
    +	{
    +		case ULOG_SMGR_CREATE:
    +			id = "SMGRCREATE";
    +			break;
    +		case ULOG_SMGR_PRESERVE:
    +			id = "SMGRPRESERVE";
    +			break;
    +	}
    +
    +	return id;
    +}
    diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c
    index e7559cdd33c..fa88705f99e 100644
    --- a/src/backend/access/rmgrdesc/undologdesc.c
    +++ b/src/backend/access/rmgrdesc/undologdesc.c
    @@ -14,6 +14,8 @@
     #include "postgres.h"
     
     #include "access/undolog.h"
    +#include "catalog/storage.h"
    +#include "catalog/storage_ulog.h"
     
     typedef struct UndoDescData
     {
    diff --git a/src/backend/access/transam/undolog.c b/src/backend/access/transam/undolog.c
    index 196e02e652f..b2fdbfcd0f9 100644
    --- a/src/backend/access/transam/undolog.c
    +++ b/src/backend/access/transam/undolog.c
    @@ -28,6 +28,7 @@
     #include "access/xlog.h"
     #include "access/xloginsert.h"
     #include "lib/dshash.h"
    +#include "catalog/storage_ulog.h"
     #include "miscadmin.h"
     #include "storage/fd.h"
     #include "storage/procarray.h"
    diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
    index 6976249e9e9..7613192e343 100644
    --- a/src/backend/catalog/index.c
    +++ b/src/backend/catalog/index.c
    @@ -3059,8 +3059,8 @@ index_build(Relation heapRelation,
     	if (indexRelation->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
     		!smgrexists(RelationGetSmgr(indexRelation), INIT_FORKNUM))
     	{
    -		smgrcreate(RelationGetSmgr(indexRelation), INIT_FORKNUM, false);
    -		log_smgrcreate(&indexRelation->rd_locator, INIT_FORKNUM);
    +		RelationCreateFork(RelationGetSmgr(indexRelation),
    +						   INIT_FORKNUM, true, true);
     		indexRelation->rd_indam->ambuildempty(indexRelation);
     	}
     
    diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
    index 5b22cf10990..d546d169d34 100644
    --- a/src/backend/catalog/storage.c
    +++ b/src/backend/catalog/storage.c
    @@ -19,13 +19,16 @@
     
     #include "postgres.h"
     
    +#include "access/undolog.h"
     #include "access/visibilitymap.h"
     #include "access/xact.h"
     #include "access/xlog.h"
     #include "access/xloginsert.h"
     #include "access/xlogutils.h"
     #include "catalog/storage.h"
    +#include "catalog/storage_ulog.h"
     #include "catalog/storage_xlog.h"
    +#include "common/hashfn_unstable.h"
     #include "miscadmin.h"
     #include "storage/bulk_write.h"
     #include "storage/freespace.h"
    @@ -76,6 +79,14 @@ typedef struct PendingRelSync
     static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
     static HTAB *pendingSyncHash = NULL;
     
    +/* Storage for smgr_undo()/smgr_undoevent() */
    +static RelFileLocator *rlocs = NULL;
    +static int			   rlocs_cap = 0;
    +static int			   rlocs_len = 0;
    +
    +/* local functions */
    +static void ulog_smgrcreate(SMgrRelation srel, ForkNumber forkNum);
    +static void ulog_smgrpreserve(RelFileLocator rloc, ForkNumber forkNum);
     
     /*
      * AddPendingSync
    @@ -147,36 +158,54 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
     	}
     
     	srel = smgropen(rlocator, procNumber);
    -	smgrcreate(srel, MAIN_FORKNUM, false);
     
    -	if (needs_wal)
    -		log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
    +	RelationCreateFork(srel, MAIN_FORKNUM, needs_wal, register_delete);
     
    -	/*
    -	 * Add the relation to the list of stuff to delete at abort, if we are
    -	 * asked to do so.
    -	 */
    -	if (register_delete)
    +	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
    +	{
    +		Assert(procNumber == INVALID_PROC_NUMBER);
    +		AddPendingSync(&rlocator);
    +	}
    +
    +	return srel;
    +}
    +
    +/*
    + * RelationCreateFork
    + *		Create physical storage for a fork of a relation.
    + *
    + * This function creates a relation fork in a transactional manner. When
    + * undo_log is true, the creation is UNDO-logged so that in case of transaction
    + * aborts or server crashes later on, the fork will be removed. If the caller
    + * plans to remove the fork in another way, it should pass false. Additionally,
    + * it is WAL-logged if wal_log is true.
    + *
    + * The pending-delete entry registered here carries the backend that srel was
    + * opened for, just as RelationCreateStorage() used to store its procNumber,
    + * so that abort-time cleanup addresses the same storage that was created.
    + */
    +void
    +RelationCreateFork(SMgrRelation srel, ForkNumber forkNum,
    +				   bool wal_log, bool undo_log)
    +{
    +	/* Schedule the removal of this fork at abort or crash if requested. */
    +	if (undo_log)
     	{
     		PendingRelDelete *pending;
     
    +		ulog_smgrcreate(srel, forkNum);
    +
     		pending = (PendingRelDelete *)
     			MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
    -		pending->rlocator = rlocator;
    -		pending->procNumber = procNumber;
    +		pending->rlocator = srel->smgr_rlocator.locator;
    +		pending->procNumber = srel->smgr_rlocator.backend;
     		pending->atCommit = false;	/* delete if abort */
     		pending->nestLevel = GetCurrentTransactionNestLevel();
     		pending->next = pendingDeletes;
     		pendingDeletes = pending;
     	}
     
    -	if (relpersistence == RELPERSISTENCE_PERMANENT && !XLogIsNeeded())
    -	{
    -		Assert(procNumber == INVALID_PROC_NUMBER);
    -		AddPendingSync(&rlocator);
    -	}
    +	/* WAL-log this creation if requested. */
    +	if (wal_log)
    +		log_smgrcreate(&srel->smgr_rlocator.locator, forkNum);
     
    -	return srel;
    +	smgrcreate(srel, forkNum, false);
     }
     
     /*
    @@ -198,6 +227,35 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
     	XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE);
     }
     
    +/*
    + * Perform UndoLogWrite of a ULOG_SMGR_CREATE record to UNDO log.
    + *
    + * The record carries the relation's locator and backend, taken from srel,
    + * together with the fork that is about to be created.
    + */
    +static void
    +ulog_smgrcreate(SMgrRelation srel, ForkNumber forkNum)
    +{
    +	ul_smgr_create ulrec;
    +
    +	ulrec.rlocator = srel->smgr_rlocator.locator;
    +	ulrec.backend = srel->smgr_rlocator.backend;
    +	ulrec.forknum = forkNum;
    +	UndoLogWrite(RM_SMGR_ID, ULOG_SMGR_CREATE, &ulrec, sizeof(ulrec));
    +}
    +
    +/*
    + * Perform UndoLogWrite of a ULOG_SMGR_PRESERVE record to UNDO log.
    + *
    + * Only the MAIN fork may be passed.  The record is always written with
    + * backend set to INVALID_PROC_NUMBER.
    + */
    +static void
    +ulog_smgrpreserve(RelFileLocator rloc, ForkNumber forkNum)
    +{
    +	ul_smgr_preserve ulrec;
    +
    +	Assert(forkNum == MAIN_FORKNUM);
    +	ulrec.rlocator = rloc;
    +	ulrec.backend = INVALID_PROC_NUMBER;
    +	ulrec.forknum = forkNum;
    +	UndoLogWrite(RM_SMGR_ID, ULOG_SMGR_PRESERVE, &ulrec, sizeof(ulrec));
    +}
    +
     /*
      * RelationDropStorage
      *		Schedule unlinking of physical storage at transaction commit.
    @@ -253,6 +311,7 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
     	PendingRelDelete *pending;
     	PendingRelDelete *prev;
     	PendingRelDelete *next;
    +	bool			  found = false;
     
     	prev = NULL;
     	for (pending = pendingDeletes; pending != NULL; pending = next)
    @@ -261,6 +320,8 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
     		if (RelFileLocatorEquals(rlocator, pending->rlocator)
     			&& pending->atCommit == atCommit)
     		{
    +			found = true;
    +
     			/* unlink and delete list entry */
     			if (prev)
     				prev->next = next;
    @@ -275,6 +336,9 @@ RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
     			prev = pending;
     		}
     	}
    +
    +	if (found)
    +		ulog_smgrpreserve(rlocator, MAIN_FORKNUM);
     }
     
     /*
    @@ -1077,3 +1141,119 @@ smgr_redo(XLogReaderState *record)
     	else
     		elog(PANIC, "smgr_redo: unknown op code %u", info);
     }
    +
    +/*
    + * smgr_undo
    + *		Process one storage-manager UNDO log record.
    + *
    + * In the ULOGCXT_CLEANUP context this only collects work for
    + * smgr_undoevent(): a ULOG_SMGR_CREATE record for a MAIN fork appends the
    + * relation to the file-level rlocs array, and a ULOG_SMGR_PRESERVE record
    + * removes every matching entry again.  The files themselves are unlinked
    + * when smgr_undoevent() receives ULOGEVENT_XACTEND.  Nothing is done in the
    + * COMMIT, ABORT and PREPARED contexts; any other context is a PANIC.
    + *
    + * The "redo" and "crashed" arguments are currently unused.
    + */
    +void
    +smgr_undo(UndoLogRecord *record, ULogContext cxt, bool redo, bool crashed)
    +{
    +	uint8	info;
    +
    +	Assert(CritSectionCount == 0);
    +
    +	if (cxt == ULOGCXT_CLEANUP)
    +	{
    +		Assert(record);
    +		info = record->ul_info & ~ULR_INFO_MASK;
    +
    +		if (info == ULOG_SMGR_CREATE)
    +		{
    +			ul_smgr_create *ulrec = (ul_smgr_create *) ULogRecGetData(record);
    +
    +			/*
    +			 * RelationCreateFork() UNDO-logs INIT, FSM and VM forks as well,
    +			 * so records for those forks are expected here and must not trip
    +			 * an assertion.  Removal is driven per relation by the record of
    +			 * its MAIN fork, so only that one is collected; this also avoids
    +			 * adding the same relation once per fork.
    +			 *
    +			 * NOTE(review): ulrec->backend is not remembered, and
    +			 * smgr_undoevent() opens every relation with
    +			 * INVALID_PROC_NUMBER -- confirm that this is intended for
    +			 * temporary relations.
    +			 */
    +			if (ulrec->forknum == MAIN_FORKNUM)
    +			{
    +				if (rlocs_cap < rlocs_len + 1)
    +				{
    +					/*
    +					 * The array is kept in a file-level static across calls,
    +					 * so allocate it in TopMemoryContext rather than in
    +					 * whatever context the caller happens to be running in.
    +					 * Update the capacity only after the allocation has
    +					 * succeeded.
    +					 */
    +					if (rlocs_cap == 0)
    +					{
    +						rlocs = MemoryContextAlloc(TopMemoryContext,
    +												   sizeof(RelFileLocator) * 32);
    +						rlocs_cap = 32;
    +					}
    +					else
    +					{
    +						rlocs = repalloc(rlocs,
    +										 sizeof(RelFileLocator) * rlocs_cap * 2);
    +						rlocs_cap *= 2;
    +					}
    +				}
    +				rlocs[rlocs_len++] = ulrec->rlocator;
    +			}
    +		}
    +		else if (info == ULOG_SMGR_PRESERVE)
    +		{
    +			ul_smgr_preserve *ulrec =
    +				(ul_smgr_preserve *) ULogRecGetData(record);
    +			int j = 0;
    +
    +			/* Compact the array in place, dropping every matching entry. */
    +			for (int i = 0 ; i < rlocs_len ; i++)
    +			{
    +				if (RelFileLocatorEquals(ulrec->rlocator, rlocs[i]))
    +					continue;
    +
    +				if (i != j)
    +					rlocs[j] = rlocs[i];
    +				j++;
    +			}
    +
    +			rlocs_len = j;
    +		}
    +		else
    +			elog(PANIC, "smgr_undo: unknown op code %d", info);
    +	}
    +	else if (cxt == ULOGCXT_COMMIT || cxt == ULOGCXT_ABORT ||
    +			 cxt == ULOGCXT_PREPARED)
    +	{
    +		/* nothing to do here */
    +	}
    +	else
    +		elog(PANIC, "smgr_undo: unknown context code %u", cxt);
    +}
    +
    +/*
    + * smgr_undoevent
    + *		React to an UNDO log event on behalf of the storage manager.
    + *
    + * At ULOGEVENT_XACTEND, every relation still listed in rlocs (collected by
    + * smgr_undo()) has its buffers dropped and its MAIN, VM and FSM forks
    + * unlinked, after which the list is released.  ULOGEVENT_CLEANUP_INIT and
    + * ULOGEVENT_RECOVERY_END need no action; any other event is a PANIC.
    + *
    + * NOTE(review): INIT_FORKNUM is not unlinked here although callers of
    + * RelationCreateFork() UNDO-log init forks too -- confirm whether init forks
    + * of crashed transactions are cleaned up elsewhere.
    + */
    +void
    +smgr_undoevent(ULogEvent event)
    +{
    +	if (event == ULOGEVENT_XACTEND)
    +	{
    +		/*
    +		 * Since the MAIN fork was created in this transaction, rollback
    +		 * should remove all forks of this relation.  Although we could
    +		 * register an undo record individually for each fork, this may be
    +		 * more complex because VM and FSM can be created non-transactionally
    +		 * outside the transaction that created the MAIN fork.
    +		 *
    +		 * The fork list is the same for every relation, so it is built once
    +		 * here.  Appending to it inside the loop below would run past the end
    +		 * of the array from the second relation on.
    +		 */
    +		ForkNumber	forks[3] = {MAIN_FORKNUM, VISIBILITYMAP_FORKNUM,
    +								FSM_FORKNUM};
    +		BlockNumber firstblocks[3] = {0};
    +		int			nforks = lengthof(forks);
    +
    +		for (int i = 0 ; i < rlocs_len ; i++)
    +		{
    +			SMgrRelation reln;
    +
    +			/*
    +			 * Drop buffers, then the files. This can be improved by using
    +			 * smgrdounlinkall(), but currently I take the simpler way.
    +			 */
    +			reln = smgropen(rlocs[i], INVALID_PROC_NUMBER);
    +			DropRelationBuffers(reln, forks, nforks, firstblocks);
    +			for (int j = 0 ; j < nforks ; j++)
    +				smgrunlink(reln, forks[j], true);
    +
    +			smgrclose(reln);
    +		}
    +
    +		if (rlocs)
    +		{
    +			pfree(rlocs);
    +			rlocs = NULL;
    +			rlocs_cap = rlocs_len = 0;
    +		}
    +	}
    +	else if (event == ULOGEVENT_CLEANUP_INIT ||
    +			 event == ULOGEVENT_RECOVERY_END)
    +	{
    +		/* Nothing to do */
    +	}
    +	else
    +		elog(PANIC, "smgr_undoevent: unknown event code %u", event);
    +}
    diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
    index 0188e8bbd5b..be6afc7df52 100644
    --- a/src/backend/commands/sequence.c
    +++ b/src/backend/commands/sequence.c
    @@ -31,6 +31,7 @@
     #include "catalog/objectaccess.h"
     #include "catalog/pg_sequence.h"
     #include "catalog/pg_type.h"
    +#include "catalog/storage.h"
     #include "catalog/storage_xlog.h"
     #include "commands/defrem.h"
     #include "commands/sequence.h"
    @@ -344,8 +345,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple)
     		SMgrRelation srel;
     
     		srel = smgropen(rel->rd_locator, INVALID_PROC_NUMBER);
    -		smgrcreate(srel, INIT_FORKNUM, false);
    -		log_smgrcreate(&rel->rd_locator, INIT_FORKNUM);
    +		RelationCreateFork(srel, INIT_FORKNUM, true, true);
     		fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM);
     		FlushRelationBuffers(rel);
     		smgrclose(srel);
    diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
    index 49374782625..b5766989d8e 100644
    --- a/src/backend/commands/tablecmds.c
    +++ b/src/backend/commands/tablecmds.c
    @@ -15965,16 +15965,17 @@ index_copy_data(Relation rel, RelFileLocator newrlocator)
     	{
     		if (smgrexists(RelationGetSmgr(rel), forkNum))
     		{
    -			smgrcreate(dstrel, forkNum, false);
    -
    -			/*
    -			 * WAL log creation if the relation is persistent, or this is the
    -			 * init fork of an unlogged relation.
    -			 */
    -			if (RelationIsPermanent(rel) ||
    +			bool wal_log = RelationIsPermanent(rel) |
     				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
    -				 forkNum == INIT_FORKNUM))
    -				log_smgrcreate(&newrlocator, forkNum);
    +				 forkNum == INIT_FORKNUM);
    +
    +			/*
    +			 * Usually, we don't use UNDO log for FSM or VM forks, as their
    +			 * creation is not transactional. However, we're currently copying
    +			 * the entire relation in a transactional manner, which requires
    +			 * after-crash cleanup.
    +			 */
    +			RelationCreateFork(dstrel, forkNum, wal_log, true);
     			RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum,
     								rel->rd_rel->relpersistence);
     		}
    diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
    index 2622221809c..1a9c794374f 100644
    --- a/src/backend/storage/buffer/bufmgr.c
    +++ b/src/backend/storage/buffer/bufmgr.c
    @@ -4812,8 +4812,7 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
     	/*
     	 * Create and copy all forks of the relation.  During create database we
     	 * have a separate cleanup mechanism which deletes complete database
    -	 * directory.  Therefore, each individual relation doesn't need to be
    -	 * registered for cleanup.
    +	 * directory.  Therefore, do not write an UNDO log record for this relation.
     	 */
     	RelationCreateStorage(dst_rlocator, relpersistence, false);
     
    @@ -4827,6 +4826,7 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
     	{
     		if (smgrexists(src_rel, forkNum))
     		{
    +			/* Use smgrcreate() directly as no UNDO log is required. */
     			smgrcreate(dst_rel, forkNum, false);
     
     			/*
    diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
    index 01e267abf9b..d3a42d3f566 100644
    --- a/src/backend/storage/file/reinit.c
    +++ b/src/backend/storage/file/reinit.c
    @@ -34,6 +34,39 @@ typedef struct
     	RelFileNumber relnumber;	/* hash key */
     } unlogged_relation_entry;
     
    +static char **ignore_files = NULL;
    +static int nignore_elems = 0;
    +static int nignore_files = 0;
    +
    +/*
    + * reinit_ignore_file
    + *		Determine if the file should be ignored when resetting unlogged
    + *		relations.
    + *
    + * dirname and name are joined as "dirname/name" and compared against the
    + * MAIN-fork paths recorded by ResetUnloggedRelationIgnore().  A file matches
    + * when its path is such a recorded path, optionally followed by a fork
    + * suffix ("_...") or a segment suffix (".N").  Returns true if the file
    + * belongs to an ignored relation.
    + */
    +static bool
    +reinit_ignore_file(const char *dirname, const char *name)
    +{
    +	char		fnamebuf[MAXPGPATH];
    +
    +	if (nignore_files == 0)
    +		return false;
    +
    +	/*
    +	 * Build the path with snprintf(), which bounds the whole result.  The
    +	 * size argument of strncat() limits only the number of bytes appended,
    +	 * not the total length, so it cannot protect the buffer.
    +	 */
    +	snprintf(fnamebuf, sizeof(fnamebuf), "%s/%s", dirname, name);
    +
    +	for (int i = 0 ; i < nignore_files ; i++)
    +	{
    +		size_t		len = strlen(ignore_files[i]);
    +
    +		/*
    +		 * Match ignoring the fork and segment parts.  A successful strncmp()
    +		 * guarantees that fnamebuf holds at least len bytes, so fnamebuf[len]
    +		 * is a valid access.
    +		 */
    +		if (strncmp(fnamebuf, ignore_files[i], len) == 0 &&
    +			(fnamebuf[len] == '\0' || fnamebuf[len] == '_' ||
    +			 fnamebuf[len] == '.'))
    +			return true;
    +	}
    +
    +	return false;
    +}
    +
     /*
      * Reset unlogged relations from before the last restart.
      *
    @@ -204,6 +237,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
     													 &forkNum, &segno))
     				continue;
     
    +			/* Skip anything that undo log suggested to ignore */
    +			if (reinit_ignore_file(dbspacedirname, de->d_name))
    +				continue;
    +
     			/* Also skip it unless this is the init fork. */
     			if (forkNum != INIT_FORKNUM)
     				continue;
    @@ -243,6 +280,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
     													 &forkNum, &segno))
     				continue;
     
    +			/* Skip anything that undo log suggested to ignore */
    +			if (reinit_ignore_file(dbspacedirname, de->d_name))
    +				continue;
    +
     			/* We never remove the init fork. */
     			if (forkNum == INIT_FORKNUM)
     				continue;
    @@ -294,6 +335,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
     													 &forkNum, &segno))
     				continue;
     
    +			/* Skip anything that undo log suggested to ignore */
    +			if (reinit_ignore_file(dbspacedirname, de->d_name))
    +				continue;
    +
     			/* Also skip it unless this is the init fork. */
     			if (forkNum != INIT_FORKNUM)
     				continue;
    @@ -337,6 +382,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
     													 &forkNum, &segno))
     				continue;
     
    +			/* Skip anything that undo log suggested to ignore */
    +			if (reinit_ignore_file(dbspacedirname, de->d_name))
    +				continue;
    +
     			/* Also skip it unless this is the init fork. */
     			if (forkNum != INIT_FORKNUM)
     				continue;
    @@ -366,6 +415,49 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
     	}
     }
     
    +/*
    + * Record a relation that should be left alone while reinitializing unlogged
    + * relations.
    + *
    + * The MAIN-fork path of the relation, as returned by relpath() for the given
    + * locator and backend, is appended to the ignore_files array; the array
    + * starts with room for 16 entries and doubles whenever it is full.
    + * reinit_ignore_file() later compares directory entries against these paths.
    + *
    + * NOTE(review): the array is allocated with palloc() in whatever memory
    + * context is current at the first call -- confirm that this context lives
    + * until ResetUnloggedRelationIgnoreClear() is called.
    + */
    +void
    +ResetUnloggedRelationIgnore(RelFileLocator rloc, ProcNumber backend)
    +{
    +	RelFileLocatorBackend rbloc;
    +
    +	if (nignore_files >= nignore_elems)
    +	{
    +		if (ignore_files == NULL)
    +		{
    +			nignore_elems = 16;
    +			ignore_files = palloc(sizeof(char *) * nignore_elems);
    +		}
    +		else
    +		{
    +			nignore_elems *= 2;
    +			ignore_files = repalloc(ignore_files,
    +									sizeof(char *) * nignore_elems);
    +		}
    +	}
    +
    +	rbloc.backend = backend;
    +	rbloc.locator = rloc;
    +	ignore_files[nignore_files++] = relpath(rbloc, MAIN_FORKNUM);
    +}
    +
    +/*
    + * Clear the ignore list
    + *
    + * Releases every path recorded by ResetUnloggedRelationIgnore() and the
    + * array itself, and resets all bookkeeping so that reinit_ignore_file()
    + * sees an empty list afterwards.
    + */
    +void
    +ResetUnloggedRelationIgnoreClear(void)
    +{
    +	if (nignore_elems == 0)
    +		return;
    +
    +	/* The path strings were allocated by relpath(); free them as well. */
    +	for (int i = 0 ; i < nignore_files ; i++)
    +		pfree(ignore_files[i]);
    +
    +	pfree(ignore_files);
    +	ignore_files = NULL;
    +	nignore_elems = 0;
    +
    +	/*
    +	 * The entry count must be reset too; otherwise reinit_ignore_file()
    +	 * would walk the now-NULL array.
    +	 */
    +	nignore_files = 0;
    +}
    +
     /*
      * Basic parsing of putative relation filenames.
      *
    diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
    index 36ad34aa6ac..8a7654118fe 100644
    --- a/src/backend/storage/smgr/smgr.c
    +++ b/src/backend/storage/smgr/smgr.c
    @@ -819,6 +819,15 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
     	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
     }
     
    +/*
    + * smgrunlink() -- unlink the storage file
    + *
    + * This only forwards to the smgr_unlink callback of the relation's storage
    + * manager, passing the relation's locator, the fork number and isRedo
    + * through unchanged.  It does nothing else; in particular, dropping the
    + * relation's buffers is left to the caller.
    + */
    +void
    +smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
    +{
    +	smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rlocator, forknum, isRedo);
    +}
    +
     /*
      * AtEOXact_SMgr
      *
    diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
    index 5909d87d599..b0c4e689950 100644
    --- a/src/include/access/rmgrlist.h
    +++ b/src/include/access/rmgrlist.h
    @@ -27,7 +27,7 @@
     /* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode, undo, undo_desc, undo_identify, undo_event */
     PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode, NULL, NULL, NULL, NULL)
     PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode, NULL, NULL, NULL, NULL)
    -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
    +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, smgr_undo, smgr_undodesc, smgr_undoidentify, smgr_undoevent)
     PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
     PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
     PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
    diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
    index 72ef3ee92c0..3451d6ac80c 100644
    --- a/src/include/catalog/storage.h
    +++ b/src/include/catalog/storage.h
    @@ -25,6 +25,8 @@ extern PGDLLIMPORT int wal_skip_threshold;
     extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator,
     										  char relpersistence,
     										  bool register_delete);
    +extern void RelationCreateFork(SMgrRelation srel, ForkNumber forkNum,
    +							   bool wal_log, bool undo_log);
     extern void RelationDropStorage(Relation rel);
     extern void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit);
     extern void RelationPreTruncate(Relation rel);
    diff --git a/src/include/catalog/storage_ulog.h b/src/include/catalog/storage_ulog.h
    new file mode 100644
    index 00000000000..9568ab24cfb
    --- /dev/null
    +++ b/src/include/catalog/storage_ulog.h
    @@ -0,0 +1,48 @@
    +/*-------------------------------------------------------------------------
    + *
    + * storage_ulog.h
    + *	  prototypes for Undo Log support for backend/catalog/storage.c
    + *
    + *
    + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
    + * Portions Copyright (c) 1994, Regents of the University of California
    + *
    + * src/include/catalog/storage_ulog.h
    + *
    + *-------------------------------------------------------------------------
    + */
    +#ifndef STORAGE_ULOG_H
    +#define STORAGE_ULOG_H
    +
    +#include "access/undolog.h"
    +#include "lib/stringinfo.h"
    +#include "storage/smgr.h"
    +
    +/* ULOG gives us high 4 bits (just following xlog) */
    +#define ULOG_SMGR_CREATE			0x10
    +#define ULOG_SMGR_PRESERVE			0x20
    +
    +/* undo log entry for storage file creation */
    +typedef struct ul_smgr_create
    +{
    +	RelFileLocator	rlocator;
    +	ProcNumber		backend;
    +	ForkNumber		forknum;
    +} ul_smgr_create;
    +
    +/* undo log entry telling that a created storage file must be kept */
    +typedef struct ul_smgr_preserve
    +{
    +	RelFileLocator	rlocator;
    +	ProcNumber		backend;
    +	ForkNumber		forknum;
    +} ul_smgr_preserve;
    +
    +extern void smgr_undo(UndoLogRecord *record, ULogContext cxt, bool redo,
    +					  bool crashed);
    +extern void	smgr_undodesc(StringInfo buf, UndoLogRecord *record);
    +extern const char *smgr_undoidentify(uint8 info);
    +extern void smgr_undoevent(ULogEvent event);
    +
    +/*
    + * Accessors for an UNDO log record: the payload directly follows the
    + * UndoLogRecord header.  The macro argument is parenthesized so that any
    + * pointer expression can be passed safely.
    + */
    +#define ULogRecGetData(record) ((char *) (record) + sizeof(UndoLogRecord))
    +#define ULogRecGetInfo(record) ((record)->ul_info)
    +
    +#endif							/* STORAGE_ULOG_H */
    diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h
    index 1373d509df2..02bf55d3a6b 100644
    --- a/src/include/storage/reinit.h
    +++ b/src/include/storage/reinit.h
    @@ -16,9 +16,13 @@
     #define REINIT_H
     
     #include "common/relpath.h"
    +#include "storage/relfilelocator.h"
     
     
     extern void ResetUnloggedRelations(int op);
    +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc,
    +										ProcNumber backend);
    +extern void ResetUnloggedRelationIgnoreClear(void);
     extern bool parse_filename_for_nontemp_relation(const char *name,
     												RelFileNumber *relnumber,
     												ForkNumber *fork,
    diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
    index 63a186bd346..a2c15d6af90 100644
    --- a/src/include/storage/smgr.h
    +++ b/src/include/storage/smgr.h
    @@ -110,6 +110,7 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
     						 BlockNumber *nblocks);
     extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
     extern void smgrregistersync(SMgrRelation reln, ForkNumber forknum);
    +extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo);
     extern void AtEOXact_SMgr(void);
     extern bool ProcessBarrierSmgrRelease(void);
     
    diff --git a/src/test/recovery/t/013_crash_restart.pl b/src/test/recovery/t/013_crash_restart.pl
    index d5d24e31d90..4df88efeb3d 100644
    --- a/src/test/recovery/t/013_crash_restart.pl
    +++ b/src/test/recovery/t/013_crash_restart.pl
    @@ -86,6 +86,23 @@ ok( pump_until(
     $killme_stdout = '';
     $killme_stderr = '';
     
    +# Also create a table whose storage file should *not* survive the crash, and
    +# remember where that file lives so it can be checked after the restart.
    +$killme_stdin .= q[
    +CREATE TABLE should_not_survive (a int);
    +SELECT pg_relation_filepath('should_not_survive');
    +];
    +ok( pump_until(
    +		$killme, $psql_timeout, \$killme_stdout,
    +		qr/base\/[[:digit:]\/]+[\r\n]$/m),
    +	'created a table');
    +my $relfilerelpath = $killme_stdout;
    +# The pattern above accepts a CR line ending as well, so strip any trailing
    +# CR/LF; chomp() alone would leave a "\r" at the end of the path.
    +$relfilerelpath =~ s/[\r\n]+\z//;
    +$killme_stdout = '';
    +$killme_stderr = '';
    +
    +my $relfilepath = $node->data_dir . "/" . $relfilerelpath;
    +ok( -e $relfilepath,
    +	"storage file is created in xact that is going to crash");
     
     # Start longrunning query in second session; its failure will signal that
     # crash-restart has occurred.  The initial wait for the trivial select is to
    @@ -144,6 +161,8 @@ $killme->run();
     ($monitor_stdin, $monitor_stdout, $monitor_stderr) = ('', '', '');
     $monitor->run();
     
    +ok( ! -e $relfilepath,
    +	"orphaned storage file is correctly removed");
     
     # Acquire pid of new backend
     $killme_stdin .= q[
    -- 
    2.43.5
    
    
    ----Next_Part(Fri_Dec_27_17_25_02_2024_357)--
    Content-Type: Text/X-Patch; charset=us-ascii
    Content-Transfer-Encoding: 7bit
    Content-Disposition: attachment;
     filename="v36-0006-new-indexam-bit-for-unlogged-storage-compatibili.patch"