v20251228-v3-0001-Add-Storage-I-O-Transform-Hooks-for-PostgreSQL.patch

application/octet-stream

Filename: v20251228-v3-0001-Add-Storage-I-O-Transform-Hooks-for-PostgreSQL.patch
Type: application/octet-stream
Part: 0
Message: Re: RFC: PostgreSQL Storage I/O Transformation Hooks
From 82ce5cc05f1ce0311a2eedd559f1db7a7703f126 Mon Sep 17 00:00:00 2001
From: Henson Choi <assam258@gmail.com>
Date: Tue, 2 Dec 2025 21:50:12 +0900
Subject: [PATCH v3 1/2] Add Storage I/O Transform Hooks for PostgreSQL

This patch introduces a set of hook points that allow extensions to
intercept and transform data during storage I/O operations.  The hooks
are designed to support transparent data encryption (TDE) and similar
use cases that require data transformation at the storage layer.

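The following hooks are added:

  - mdread_post_hook in md.c, called from mdreadv() and from
    buffer_readv_complete_one() in bufmgr.c, for block transformation
    after a read and before checksum verification
  - mdwrite_pre_hook / mdextend_pre_hook in md.c for block
    transformation before mdwritev() and mdextend() write to disk
  - xlog_insert_pre_hook in xloginsert.c for WAL record transformation
    after XLogRecordAssemble() and before XLogInsertRecord()
  - xlog_decode_pre_hook in xlogreader.c for WAL record transformation
    before validation and decoding

In addition, pd_flags bits 3-7 are reserved as a 5-bit transform ID
(PageGetTransformId / PageSetTransformId in bufpage.h), and
XLR_BLOCK_ID_TRANSFORMED (251) is added to xlogrecord.h as an I/O
transform marker block ID.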

Each hook is optional and defaults to NULL, so the only cost when no
extension is loaded is a NULL-pointer check at each call site.

Author: Henson Choi <assam258@gmail.com>
---
 src/backend/access/transam/xloginsert.c | 10 ++++
 src/backend/access/transam/xlogreader.c | 21 ++++++++
 src/backend/storage/buffer/bufmgr.c     |  9 ++++
 src/backend/storage/smgr/md.c           | 20 ++++++++
 src/include/access/xloginsert.h         | 20 ++++++++
 src/include/access/xlogreader.h         | 20 ++++++++
 src/include/access/xlogrecord.h         |  5 ++
 src/include/storage/bufpage.h           | 25 +++++++++-
 src/include/storage/md.h                | 65 +++++++++++++++++++++++++
 9 files changed, 194 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..f518ef3f16f 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -136,6 +136,12 @@ static bool begininsert_called = false;
 /* Memory context to hold the registered buffer and data references. */
 static MemoryContext xloginsert_cxt;
 
+/*
+ * Hook variable for WAL insert transformation (e.g., encryption).
+ * Extensions can set it to transform assembled WAL data before insertion.
+ */
+xlog_insert_pre_hook_type xlog_insert_pre_hook = NULL;
+
 static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info,
 									   XLogRecPtr RedoRecPtr, bool doPageWrites,
 									   XLogRecPtr *fpw_lsn, int *num_fpi,
@@ -526,6 +532,10 @@ XLogInsert(RmgrId rmid, uint8 info)
 								 &fpw_lsn, &num_fpi, &fpi_bytes,
 								 &topxid_included);
 
+		/* Pre-insert hook for transformation (e.g., encryption) */
+		if (xlog_insert_pre_hook)
+			rdt = xlog_insert_pre_hook(rdt);
+
 		EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi,
 								  fpi_bytes, topxid_included);
 	} while (!XLogRecPtrIsValid(EndPos));
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5e5001b2101..169f2b06fc5 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -40,6 +40,13 @@
 #include "common/logging.h"
 #endif
 
+/*
+ * Hook variable for WAL record transformation (e.g., decryption).
+ * Extensions can set this hook to transform raw WAL data before decoding.
+ * Frontend tools can also set this hook at startup.
+ */
+xlog_decode_pre_hook_type xlog_decode_pre_hook = NULL;
+
 static void report_invalid_record(XLogReaderState *state, const char *fmt,...)
 			pg_attribute_printf(2, 3);
 static void allocate_recordbuf(XLogReaderState *state, uint32 reclength);
@@ -843,6 +850,11 @@ restart:
 		Assert(gotheader);
 
 		record = (XLogRecord *) state->readRecordBuf;
+
+		/* Pre-validation hook for transformation (e.g., decryption) */
+		if (xlog_decode_pre_hook)
+			record = xlog_decode_pre_hook(state, record, RecPtr, true);
+
 		if (!ValidXLogRecord(state, record, RecPtr))
 			goto err;
 
@@ -862,6 +874,15 @@ restart:
 			goto err;
 
 		/* Record does not cross a page boundary */
+
+		/*
+		 * Pre-validation hook for transformation (e.g., decryption).
+		 * inplace_allowed is false because record points to readBuf, which
+		 * may be copied back to WAL files (e.g., FinishWalRecovery).
+		 */
+		if (xlog_decode_pre_hook)
+			record = xlog_decode_pre_hook(state, record, RecPtr, false);
+
 		if (!ValidXLogRecord(state, record, RecPtr))
 			goto err;
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index eb55102b0d7..ea0b62e98f2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -57,6 +57,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/md.h"
 #include "storage/proc.h"
 #include "storage/read_stream.h"
 #include "storage/smgr.h"
@@ -7401,6 +7402,14 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
 			VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
 #endif
 
+		/* Decrypt block before checksum verification */
+		if (mdread_post_hook)
+		{
+			RelFileLocator rlocator = BufTagGetRelFileLocator(&tag);
+
+			mdread_post_hook(&rlocator, tag.forkNum, tag.blockNum, (void **) &bufdata, 1);
+		}
+
 		if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
 							failed_checksum))
 		{
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 71bcdeb6601..5416128d2cc 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -96,6 +96,14 @@ typedef struct _MdfdVec
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 
+/*
+ * Hook variables for I/O transformation (e.g., encryption/decryption).
+ * Extensions can set these hooks to transform data during storage I/O.
+ */
+mdread_post_hook_type mdread_post_hook = NULL;
+mdwrite_pre_hook_type mdwrite_pre_hook = NULL;
+mdextend_pre_hook_type mdextend_pre_hook = NULL;
+
 
 /* Populate a file tag describing an md.c segment file. */
 #define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
@@ -513,6 +521,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 						relpath(reln->smgr_rlocator, forknum).str,
 						InvalidBlockNumber)));
 
+	/* Pre-extend hook for transformation (e.g., encryption) */
+	if (mdextend_pre_hook)
+		buffer = mdextend_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffer);
+
 	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
 
 	seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -972,6 +984,10 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
 		}
 
+		/* Post-read hook for transformation (e.g., decryption) */
+		if (mdread_post_hook)
+			mdread_post_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks_this_segment);
+
 		nblocks -= nblocks_this_segment;
 		buffers += nblocks_this_segment;
 		blocknum += nblocks_this_segment;
@@ -1064,6 +1080,10 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
 #endif
 
+	/* Pre-write hook for transformation (e.g., encryption) */
+	if (mdwrite_pre_hook)
+		buffers = mdwrite_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks);
+
 	while (nblocks > 0)
 	{
 		struct iovec iov[PG_IOV_MAX];
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index d6a71415d4f..cc54459ad33 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -19,6 +19,26 @@
 #include "storage/relfilelocator.h"
 #include "utils/relcache.h"
 
+/* Forward declaration for XLogRecData */
+struct XLogRecData;
+
+/*
+ * Hook function type for WAL insert transformation (e.g., encryption).
+ * Called after XLogRecordAssemble() but before XLogInsertRecord().
+ * The extension can transform (e.g., encrypt) the assembled WAL record data.
+ * Returns the (possibly modified) XLogRecData chain to be inserted.
+ *
+ * The first node's data points to the XLogRecord header, which contains
+ * xl_rmid and xl_info should the hook need them.
+ *
+ * On failure, the hook should either PANIC or return the original rdata
+ * as fallback.
+ */
+typedef struct XLogRecData *(*xlog_insert_pre_hook_type) (struct XLogRecData *rdata);
+
+/* Hook variable for WAL insert transformation */
+extern PGDLLIMPORT xlog_insert_pre_hook_type xlog_insert_pre_hook;
+
 /*
  * The minimum size of the WAL construction working area. If you need to
  * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index dfabbbd57d4..898d52a1013 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -400,6 +400,26 @@ extern bool DecodeXLogRecord(XLogReaderState *state,
 							 XLogRecPtr lsn,
 							 char **errormsg);
 
+/*
+ * Hook function type for WAL record transformation (e.g., decryption).
+ * Called before ValidXLogRecord() and DecodeXLogRecord().
+ * The extension can decrypt or otherwise transform the raw record data.
+ * Returns the (possibly modified) XLogRecord to be validated and decoded.
+ *
+ * If inplace_allowed is true, the hook may modify the record in place.
+ * If false, it must not; a transformed record must use a separate buffer.
+ *
+ * On failure, the hook should either PANIC or return the original record
+ * as fallback.
+ */
+typedef XLogRecord *(*xlog_decode_pre_hook_type) (XLogReaderState *state,
+												  XLogRecord *record,
+												  XLogRecPtr lsn,
+												  bool inplace_allowed);
+
+/* Hook variable for WAL record transformation */
+extern PGDLLIMPORT xlog_decode_pre_hook_type xlog_decode_pre_hook;
+
 /*
  * Macros that provide access to parts of the record most recently returned by
  * XLogReadRecord() or XLogNextRecord().
diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h
index a06833ce0a3..9cfb2aff5ae 100644
--- a/src/include/access/xlogrecord.h
+++ b/src/include/access/xlogrecord.h
@@ -244,5 +244,10 @@ typedef struct XLogRecordDataHeaderLong
 #define XLR_BLOCK_ID_DATA_LONG		254
 #define XLR_BLOCK_ID_ORIGIN			253
 #define XLR_BLOCK_ID_TOPLEVEL_XID	252
+/*
+ * I/O transform hook marker. Uses same header format as XLogRecordDataHeaderLong
+ * (1 byte id + 4 bytes length). Use SizeOfXLogRecordDataHeaderLong for size.
+ */
+#define XLR_BLOCK_ID_TRANSFORMED	251
 
 #endif							/* XLOGRECORD_H */
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index abc2cf2a020..f18f77d3d22 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -189,7 +189,17 @@ typedef PageHeaderData *PageHeader;
 #define PD_ALL_VISIBLE		0x0004	/* all tuples on page are visible to
 									 * everyone */
 
-#define PD_VALID_FLAG_BITS	0x0007	/* OR of all valid pd_flags bits */
+/*
+ * Transform ID field (5 bits: values 0-31) for I/O transform extensions.
+ * Value 0 means the page is not transformed (backward compatible).
+ * Values 1-31 are available for extensions to define their own meanings
+ * (e.g., encryption key versions, algorithm identifiers, migration markers).
+ */
+#define PD_TRANSFORM_ID_MASK	0x00F8	/* bits 3-7 */
+#define PD_TRANSFORM_ID_SHIFT	3
+#define PD_TRANSFORM_NONE		0		/* not transformed (core reserved) */
+
+#define PD_VALID_FLAG_BITS	0x00FF	/* OR of all valid pd_flags bits */
 
 /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -441,6 +451,19 @@ PageClearAllVisible(Page page)
 	((PageHeader) page)->pd_flags &= ~PD_ALL_VISIBLE;
 }
 
+static inline uint8
+PageGetTransformId(const PageData *page)
+{
+	return (((const PageHeaderData *) page)->pd_flags & PD_TRANSFORM_ID_MASK) >> PD_TRANSFORM_ID_SHIFT;
+}
+static inline void
+PageSetTransformId(Page page, uint8 id)
+{
+	((PageHeader) page)->pd_flags =
+		(((PageHeader) page)->pd_flags & ~PD_TRANSFORM_ID_MASK) |
+		((id << PD_TRANSFORM_ID_SHIFT) & PD_TRANSFORM_ID_MASK);
+}
+
 /*
  * These two require "access/transam.h", so left as macros.
  */
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index b563c27abf0..0a766a2b61f 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -22,6 +22,71 @@
 
 extern PGDLLIMPORT const PgAioHandleCallbacks aio_md_readv_cb;
 
+/*
+ * Hook function types for I/O transformation (e.g., encryption/decryption).
+ * These hooks allow extensions to transform data during storage I/O operations.
+ */
+
+/*
+ * Called after blocks are read from disk, before PostgreSQL's checksum verification.
+ * Extension can reverse-transform (e.g., decrypt) the data in place.
+ *
+ * For synchronous reads, called from mdreadv() after read completes.
+ * For AIO reads, called from buffer_readv_complete_one() before PageIsVerified().
+ *
+ * Note: The hook is responsible for verifying the on-disk checksum before the
+ * reverse transformation and recalculating the checksum afterwards.  This
+ * ensures data integrity is verified at both stages and that PostgreSQL's own
+ * checksum verification passes.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors).
+ */
+typedef void (*mdread_post_hook_type) (RelFileLocator *rlocator,
+									   ForkNumber forknum,
+									   BlockNumber blocknum,
+									   void **buffers,
+									   BlockNumber nblocks);
+
+/*
+ * Called before mdwritev() writes blocks to disk.
+ * The extension can transform (e.g., encrypt) the data.
+ * Returns a pointer to an array of nblocks transformed buffers (the hook
+ * manages the memory, typically using static local storage).
+ *
+ * Note: The hook should recalculate checksum on transformed data after
+ * transformation. This on-disk checksum will be verified on read before
+ * reverse transformation, ensuring disk-level data integrity.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors),
+ * or return the original buffers with a WARNING as fallback.
+ */
+typedef const void **(*mdwrite_pre_hook_type) (RelFileLocator *rlocator,
+											   ForkNumber forknum,
+											   BlockNumber blocknum,
+											   const void **buffers,
+											   BlockNumber nblocks);
+
+/*
+ * Called before mdextend() extends a relation with a new block.
+ * Returns a pointer to the transformed buffer (the hook manages the memory,
+ * typically using static local storage).
+ *
+ * Note: Same as write hook - the hook should recalculate checksum on
+ * transformed data after transformation.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors),
+ * or return the original buffer with a WARNING as fallback.
+ */
+typedef const void *(*mdextend_pre_hook_type) (RelFileLocator *rlocator,
+											   ForkNumber forknum,
+											   BlockNumber blocknum,
+											   const void *buffer);
+
+/* Hook variables for I/O transformation */
+extern PGDLLIMPORT mdread_post_hook_type mdread_post_hook;
+extern PGDLLIMPORT mdwrite_pre_hook_type mdwrite_pre_hook;
+extern PGDLLIMPORT mdextend_pre_hook_type mdextend_pre_hook;
+
 /* md storage manager functionality */
 extern void mdinit(void);
 extern void mdopen(SMgrRelation reln);
-- 
2.50.1 (Apple Git-155)