v20251229-v4-0001-Add-Storage-I-O-Transform-Hooks-for-PostgreSQL.patch
application/octet-stream
Filename: v20251229-v4-0001-Add-Storage-I-O-Transform-Hooks-for-PostgreSQL.patch
Type: application/octet-stream
Part: 0
From 82ce5cc05f1ce0311a2eedd559f1db7a7703f126 Mon Sep 17 00:00:00 2001
From: Henson Choi <assam258@gmail.com>
Date: Tue, 2 Dec 2025 21:50:12 +0900
Subject: [PATCH v4 1/3] Add Storage I/O Transform Hooks for PostgreSQL
This patch introduces a set of hook points that allow extensions to
intercept and transform data during storage I/O operations. The hooks
are designed to support transparent data encryption (TDE) and similar
use cases that require data transformation at the storage layer.
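The following hooks are added:
- mdread_post_hook / mdwrite_pre_hook / mdextend_pre_hook in md.c for
low-level storage manager I/O transformation; mdread_post_hook is also
called from buffer_readv_complete_one() in bufmgr.c on the AIO read
path, before checksum verification
- xlog_insert_pre_hook in xloginsert.c for WAL record transformation
after assembly and before insertion
- xlog_decode_pre_hook in xlogreader.c for WAL record transformation
before validation and decoding
In addition, pd_flags bits 3-7 become a page transform ID
(PD_TRANSFORM_ID_MASK, PageGetTransformId/PageSetTransformId), and
XLR_BLOCK_ID_TRANSFORMED is defined as an I/O transform hook marker.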
Each hook is optional and defaults to NULL, so the only cost when no
extension is loaded is a NULL check at each call site.
Author: Henson Choi <assam258@gmail.com>
---
src/backend/access/transam/xloginsert.c | 10 ++++
src/backend/access/transam/xlogreader.c | 21 ++++++++
src/backend/storage/buffer/bufmgr.c | 9 ++++
src/backend/storage/smgr/md.c | 20 ++++++++
src/include/access/xloginsert.h | 20 ++++++++
src/include/access/xlogreader.h | 20 ++++++++
src/include/access/xlogrecord.h | 5 ++
src/include/storage/bufpage.h | 25 +++++++++-
src/include/storage/md.h | 65 +++++++++++++++++++++++++
9 files changed, 194 insertions(+), 1 deletion(-)
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index a56d5a55282..f518ef3f16f 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -136,6 +136,12 @@ static bool begininsert_called = false;
/* Memory context to hold the registered buffer and data references. */
static MemoryContext xloginsert_cxt;
+/*
+ * Hook variable for WAL insert transformation (e.g., encryption).
+ * Extensions can set it to transform the assembled record before insertion.
+ */
+xlog_insert_pre_hook_type xlog_insert_pre_hook = NULL;
+
static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info,
XLogRecPtr RedoRecPtr, bool doPageWrites,
XLogRecPtr *fpw_lsn, int *num_fpi,
@@ -526,6 +532,10 @@ XLogInsert(RmgrId rmid, uint8 info)
&fpw_lsn, &num_fpi, &fpi_bytes,
&topxid_included);
+ /* Pre-insert hook for transformation (e.g., encryption) */
+ if (xlog_insert_pre_hook)
+ rdt = xlog_insert_pre_hook(rdt);
+
EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi,
fpi_bytes, topxid_included);
} while (!XLogRecPtrIsValid(EndPos));
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5e5001b2101..169f2b06fc5 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -40,6 +40,13 @@
#include "common/logging.h"
#endif
+/*
+ * Hook variable for WAL record transformation (e.g., decryption).
+ * Extensions can set this hook to transform raw WAL data before decoding.
+ * Frontend tools can also set this hook at startup.
+ */
+xlog_decode_pre_hook_type xlog_decode_pre_hook = NULL;
+
static void report_invalid_record(XLogReaderState *state, const char *fmt,...)
pg_attribute_printf(2, 3);
static void allocate_recordbuf(XLogReaderState *state, uint32 reclength);
@@ -843,6 +850,11 @@ restart:
Assert(gotheader);
record = (XLogRecord *) state->readRecordBuf;
+
+ /* Pre-validation decode hook; record is in readRecordBuf, so in-place is OK */
+ if (xlog_decode_pre_hook)
+ record = xlog_decode_pre_hook(state, record, RecPtr, true);
+
if (!ValidXLogRecord(state, record, RecPtr))
goto err;
@@ -862,6 +874,15 @@ restart:
goto err;
/* Record does not cross a page boundary */
+
+ /*
+ * Pre-validation hook for transformation (e.g., decryption).
+ * inplace_allowed is false because record points to readBuf, which
+ * may be copied back to WAL files (e.g., FinishWalRecovery).
+ */
+ if (xlog_decode_pre_hook)
+ record = xlog_decode_pre_hook(state, record, RecPtr, false);
+
if (!ValidXLogRecord(state, record, RecPtr))
goto err;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index eb55102b0d7..ea0b62e98f2 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -57,6 +57,7 @@
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
+#include "storage/md.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
@@ -7401,6 +7402,14 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer,
VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
#endif
+ /* Post-read hook (e.g., decryption), run before checksum verification */
+ if (mdread_post_hook)
+ {
+ RelFileLocator rlocator = BufTagGetRelFileLocator(&tag);
+
+ mdread_post_hook(&rlocator, tag.forkNum, tag.blockNum, (void **) &bufdata, 1);
+ }
+
if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
failed_checksum))
{
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 71bcdeb6601..5416128d2cc 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -96,6 +96,14 @@ typedef struct _MdfdVec
static MemoryContext MdCxt; /* context for all MdfdVec objects */
+/*
+ * Hook variables for I/O transformation (e.g., encryption/decryption).
+ * Extensions can set these hooks to transform data during storage I/O.
+ */
+mdread_post_hook_type mdread_post_hook = NULL;
+mdwrite_pre_hook_type mdwrite_pre_hook = NULL;
+mdextend_pre_hook_type mdextend_pre_hook = NULL;
+
/* Populate a file tag describing an md.c segment file. */
#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
@@ -513,6 +521,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
relpath(reln->smgr_rlocator, forknum).str,
InvalidBlockNumber)));
+ /* Pre-extend hook for transformation (e.g., encryption) */
+ if (mdextend_pre_hook)
+ buffer = mdextend_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffer);
+
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -972,6 +984,10 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
}
+ /* Post-read hook for transformation (e.g., decryption) */
+ if (mdread_post_hook)
+ mdread_post_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks_this_segment);
+
nblocks -= nblocks_this_segment;
buffers += nblocks_this_segment;
blocknum += nblocks_this_segment;
@@ -1064,6 +1080,10 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
#endif
+ /* Pre-write hook for transformation (e.g., encryption) */
+ if (mdwrite_pre_hook)
+ buffers = mdwrite_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks);
+
while (nblocks > 0)
{
struct iovec iov[PG_IOV_MAX];
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index d6a71415d4f..cc54459ad33 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -19,6 +19,26 @@
#include "storage/relfilelocator.h"
#include "utils/relcache.h"
+/* Forward declaration for XLogRecData */
+struct XLogRecData;
+
+/*
+ * Hook function type for WAL insert transformation (e.g., encryption).
+ * Called after XLogRecordAssemble() but before XLogInsertRecord(); since
+ * XLogInsert() retries that pair in a loop, it may run more than once per
+ * record.  Returns the (possibly modified) XLogRecData chain to be inserted.
+ *
+ * The first node's data points to the XLogRecord header, which contains
+ * xl_rmid and xl_info if needed by the hook.
+ *
+ * On failure, the hook should either PANIC or return the original rdata
+ * as fallback.
+ */
+typedef struct XLogRecData *(*xlog_insert_pre_hook_type) (struct XLogRecData *rdata);
+
+/* Hook variable for WAL insert transformation */
+extern PGDLLIMPORT xlog_insert_pre_hook_type xlog_insert_pre_hook;
+
/*
* The minimum size of the WAL construction working area. If you need to
* register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index dfabbbd57d4..898d52a1013 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -400,6 +400,26 @@ extern bool DecodeXLogRecord(XLogReaderState *state,
XLogRecPtr lsn,
char **errormsg);
+/*
+ * Hook function type for WAL record transformation (e.g., decryption).
+ * Called before ValidXLogRecord() and DecodeXLogRecord().
+ * Extension can decrypt or transform the raw record data.
+ * Returns the (possibly modified) XLogRecord to be validated and decoded.
+ *
+ * If inplace_allowed is true, the hook may modify the record in place.
+ * If false, the hook must allocate a new buffer and return it.
+ *
+ * On failure, the hook should either PANIC or return the original record
+ * as fallback.
+ */
+typedef XLogRecord *(*xlog_decode_pre_hook_type) (XLogReaderState *state,
+ XLogRecord *record,
+ XLogRecPtr lsn,
+ bool inplace_allowed);
+
+/* Hook variable for WAL record transformation */
+extern PGDLLIMPORT xlog_decode_pre_hook_type xlog_decode_pre_hook;
+
/*
* Macros that provide access to parts of the record most recently returned by
* XLogReadRecord() or XLogNextRecord().
diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h
index a06833ce0a3..9cfb2aff5ae 100644
--- a/src/include/access/xlogrecord.h
+++ b/src/include/access/xlogrecord.h
@@ -244,5 +244,10 @@ typedef struct XLogRecordDataHeaderLong
#define XLR_BLOCK_ID_DATA_LONG 254
#define XLR_BLOCK_ID_ORIGIN 253
#define XLR_BLOCK_ID_TOPLEVEL_XID 252
+/*
+ * I/O transform hook marker. Uses same header format as XLogRecordDataHeaderLong
+ * (1 byte id + 4 bytes length). Use SizeOfXLogRecordDataHeaderLong for size.
+ */
+#define XLR_BLOCK_ID_TRANSFORMED 251
#endif /* XLOGRECORD_H */
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index abc2cf2a020..f18f77d3d22 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -189,7 +189,17 @@ typedef PageHeaderData *PageHeader;
#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
* everyone */
-#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
+/*
+ * Transform ID field (5 bits: values 0-31) for I/O transform extensions.
+ * Value 0 means the page is not transformed (backward compatible).
+ * Values 1-31 are available for extensions to define their own meanings
+ * (e.g., encryption key versions, algorithm identifiers, migration markers).
+ */
+#define PD_TRANSFORM_ID_MASK 0x00F8 /* bits 3-7 */
+#define PD_TRANSFORM_ID_SHIFT 3
+#define PD_TRANSFORM_NONE 0 /* not transformed (core reserved) */
+
+#define PD_VALID_FLAG_BITS 0x00FF /* OR of all valid pd_flags bits */
/*
* Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -441,6 +451,19 @@ PageClearAllVisible(Page page)
((PageHeader) page)->pd_flags &= ~PD_ALL_VISIBLE;
}
+static inline uint8
+PageGetTransformId(const PageData *page)
+{
+ return (((const PageHeaderData *) page)->pd_flags & PD_TRANSFORM_ID_MASK) >> PD_TRANSFORM_ID_SHIFT;
+}
+static inline void
+PageSetTransformId(Page page, uint8 id)
+{
+ ((PageHeader) page)->pd_flags =
+ (((PageHeader) page)->pd_flags & ~PD_TRANSFORM_ID_MASK) |
+ ((id << PD_TRANSFORM_ID_SHIFT) & PD_TRANSFORM_ID_MASK);
+}
+
/*
* These two require "access/transam.h", so left as macros.
*/
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index b563c27abf0..0a766a2b61f 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -22,6 +22,71 @@
extern PGDLLIMPORT const PgAioHandleCallbacks aio_md_readv_cb;
+/*
+ * Hook function types for I/O transformation (e.g., encryption/decryption).
+ * These hooks allow extensions to transform data during storage I/O operations.
+ */
+
+/*
+ * Called after blocks are read from disk, before PostgreSQL's checksum
+ * verification, with "nblocks" buffers for consecutive blocks starting at
+ * "blocknum".  The extension reverse-transforms (e.g., decrypts) them in place.
+ *
+ * For synchronous reads, called from mdreadv() once per segment batch read.
+ * For AIO reads, called from buffer_readv_complete_one() before PageIsVerified().
+ *
+ * Note: The hook must verify the on-disk checksum before the reverse
+ * transformation and recalculate it afterwards, so that integrity is checked
+ * at both stages and PostgreSQL's own checksum verification passes.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors).
+ */
+typedef void (*mdread_post_hook_type) (RelFileLocator *rlocator,
+ ForkNumber forknum,
+ BlockNumber blocknum,
+ void **buffers,
+ BlockNumber nblocks);
+
+/*
+ * Called before mdwritev() writes blocks to disk.
+ * Extension can transform (e.g., encrypt) data.
+ * Returns pointer to transformed buffers array (hook manages the memory,
+ * typically using static local storage).
+ *
+ * Note: The hook should recalculate checksum on transformed data after
+ * transformation. This on-disk checksum will be verified on read before
+ * reverse transformation, ensuring disk-level data integrity.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors),
+ * or return the original buffers with a WARNING as fallback.
+ */
+typedef const void **(*mdwrite_pre_hook_type) (RelFileLocator *rlocator,
+ ForkNumber forknum,
+ BlockNumber blocknum,
+ const void **buffers,
+ BlockNumber nblocks);
+
+/*
+ * Called before mdextend() extends a relation with new blocks.
+ * Returns pointer to transformed buffer (hook manages the memory,
+ * typically using static local storage).
+ *
+ * Note: Same as write hook - the hook should recalculate checksum on
+ * transformed data after transformation.
+ *
+ * On failure, the hook should raise an ERROR (or PANIC for critical errors),
+ * or return the original buffer with a WARNING as fallback.
+ */
+typedef const void *(*mdextend_pre_hook_type) (RelFileLocator *rlocator,
+ ForkNumber forknum,
+ BlockNumber blocknum,
+ const void *buffer);
+
+/* Hook variables for I/O transformation */
+extern PGDLLIMPORT mdread_post_hook_type mdread_post_hook;
+extern PGDLLIMPORT mdwrite_pre_hook_type mdwrite_pre_hook;
+extern PGDLLIMPORT mdextend_pre_hook_type mdextend_pre_hook;
+
/* md storage manager functionality */
extern void mdinit(void);
extern void mdopen(SMgrRelation reln);
--
2.50.1 (Apple Git-155)