From 82ce5cc05f1ce0311a2eedd559f1db7a7703f126 Mon Sep 17 00:00:00 2001 From: Henson Choi Date: Tue, 2 Dec 2025 21:50:12 +0900 Subject: [PATCH v4 1/3] Add Storage I/O Transform Hooks for PostgreSQL This patch introduces a set of hook points that allow extensions to intercept and transform data during storage I/O operations. The hooks are designed to support transparent data encryption (TDE) and similar use cases that require data transformation at the storage layer. The following hooks are added: - xlog_insert_pre_hook in xloginsert.c for WAL record transformation after XLogRecordAssemble() and before XLogInsertRecord() - xlog_decode_pre_hook in xlogreader.c for WAL record transformation before record validation and decoding - mdwrite_pre_hook / mdextend_pre_hook / mdread_post_hook in md.c for low-level storage manager I/O transformation; mdread_post_hook is also called from buffer_readv_complete_one() in bufmgr.c so that AIO reads are transformed before checksum verification The patch also reserves XLR_BLOCK_ID_TRANSFORMED (251) as a WAL block ID marker and a 5-bit transform ID field in pd_flags (PD_TRANSFORM_ID_MASK) for use by such extensions. Each hook is optional and defaults to NULL, so the only cost when no extension is loaded is a NULL-pointer check at each call site. Author: Henson Choi --- src/backend/access/transam/xloginsert.c | 10 ++++ src/backend/access/transam/xlogreader.c | 21 ++++++++ src/backend/storage/buffer/bufmgr.c | 9 ++++ src/backend/storage/smgr/md.c | 20 ++++++++ src/include/access/xloginsert.h | 20 ++++++++ src/include/access/xlogreader.h | 20 ++++++++ src/include/access/xlogrecord.h | 5 ++ src/include/storage/bufpage.h | 25 +++++++++- src/include/storage/md.h | 65 +++++++++++++++++++++++++ 9 files changed, 194 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a56d5a55282..f518ef3f16f 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -136,6 +136,12 @@ static bool begininsert_called = false; /* Memory context to hold the registered buffer and data references. */ static MemoryContext xloginsert_cxt; +/* + * Hook variable for WAL insert transformation (e.g., encryption). 
+ * Extensions can set this hook to transform the assembled WAL record before insertion. + */ +xlog_insert_pre_hook_type xlog_insert_pre_hook = NULL; + static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn, int *num_fpi, @@ -526,6 +532,10 @@ XLogInsert(RmgrId rmid, uint8 info) &fpw_lsn, &num_fpi, &fpi_bytes, &topxid_included); + /* Pre-insert hook for transformation (e.g., encryption) */ + if (xlog_insert_pre_hook) + rdt = xlog_insert_pre_hook(rdt); + EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi, fpi_bytes, topxid_included); } while (!XLogRecPtrIsValid(EndPos)); diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 5e5001b2101..169f2b06fc5 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -40,6 +40,13 @@ #include "common/logging.h" #endif +/* + * Hook variable for WAL record transformation (e.g., decryption). + * Extensions can set this hook to transform raw WAL data before decoding. + * Frontend tools can also set this hook at startup. + */ +xlog_decode_pre_hook_type xlog_decode_pre_hook = NULL; + static void report_invalid_record(XLogReaderState *state, const char *fmt,...) pg_attribute_printf(2, 3); static void allocate_recordbuf(XLogReaderState *state, uint32 reclength); @@ -843,6 +850,11 @@ restart: Assert(gotheader); record = (XLogRecord *) state->readRecordBuf; + + /* Pre-validation hook for transformation (e.g., decryption) */ + if (xlog_decode_pre_hook) + record = xlog_decode_pre_hook(state, record, RecPtr, true); + if (!ValidXLogRecord(state, record, RecPtr)) goto err; @@ -862,6 +874,15 @@ restart: goto err; /* Record does not cross a page boundary */ + + /* + * Pre-validation hook for transformation (e.g., decryption). + * inplace_allowed is false because record points to readBuf, which + * may be copied back to WAL files (e.g., FinishWalRecovery). 
+ */ + if (xlog_decode_pre_hook) + record = xlog_decode_pre_hook(state, record, RecPtr, false); + if (!ValidXLogRecord(state, record, RecPtr)) goto err; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index eb55102b0d7..ea0b62e98f2 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,6 +57,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/md.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -7401,6 +7402,14 @@ buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ); #endif + /* Decrypt block before checksum verification */ + if (mdread_post_hook) + { + RelFileLocator rlocator = BufTagGetRelFileLocator(&tag); + + mdread_post_hook(&rlocator, tag.forkNum, tag.blockNum, (void **) &bufdata, 1); + } + if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags, failed_checksum)) { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 71bcdeb6601..5416128d2cc 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -96,6 +96,14 @@ typedef struct _MdfdVec static MemoryContext MdCxt; /* context for all MdfdVec objects */ +/* + * Hook variables for I/O transformation (e.g., encryption/decryption). + * Extensions can set these hooks to transform data during storage I/O. + */ +mdread_post_hook_type mdread_post_hook = NULL; +mdwrite_pre_hook_type mdwrite_pre_hook = NULL; +mdextend_pre_hook_type mdextend_pre_hook = NULL; + /* Populate a file tag describing an md.c segment file. 
*/ #define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \ @@ -513,6 +521,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forknum).str, InvalidBlockNumber))); + /* Pre-extend hook for transformation (e.g., encryption) */ + if (mdextend_pre_hook) + buffer = mdextend_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffer); + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -972,6 +984,10 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes); } + /* Post-read hook for transformation (e.g., decryption) */ + if (mdread_post_hook) + mdread_post_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks_this_segment); + nblocks -= nblocks_this_segment; buffers += nblocks_this_segment; blocknum += nblocks_this_segment; @@ -1064,6 +1080,10 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum)); #endif + /* Pre-write hook for transformation (e.g., encryption) */ + if (mdwrite_pre_hook) + buffers = mdwrite_pre_hook(&reln->smgr_rlocator.locator, forknum, blocknum, buffers, nblocks); + while (nblocks > 0) { struct iovec iov[PG_IOV_MAX]; diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index d6a71415d4f..cc54459ad33 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -19,6 +19,26 @@ #include "storage/relfilelocator.h" #include "utils/relcache.h" +/* Forward declaration for XLogRecData */ +struct XLogRecData; + +/* + * Hook function type for WAL insert transformation (e.g., encryption). + * Called after XLogRecordAssemble() but before XLogInsertRecord(). + * Extension can transform the assembled WAL record data for encryption. 
+ * Returns the (possibly modified) XLogRecData chain to be inserted. + * + * The first node's data points to the XLogRecord header, which contains + * xl_rmid and xl_info if needed by the hook. + * + * On failure, the hook should either PANIC or return the original rdata + * as fallback. + */ +typedef struct XLogRecData *(*xlog_insert_pre_hook_type) (struct XLogRecData *rdata); + +/* Hook variable for WAL insert transformation */ +extern PGDLLIMPORT xlog_insert_pre_hook_type xlog_insert_pre_hook; + /* * The minimum size of the WAL construction working area. If you need to * register more than XLR_NORMAL_MAX_BLOCK_ID block references or have more diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index dfabbbd57d4..898d52a1013 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -400,6 +400,26 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecPtr lsn, char **errormsg); +/* + * Hook function type for WAL record transformation (e.g., decryption). + * Called before ValidXLogRecord() and DecodeXLogRecord(). + * An extension can decrypt or transform the raw record data. + * Returns the (possibly modified) XLogRecord to be validated and decoded. + * + * If inplace_allowed is true, the hook may modify the record in place. + * If false, it must not; a transformed copy must be returned in a new buffer. + * + * On failure, the hook should either PANIC or return the original record + * as fallback. + */ +typedef XLogRecord *(*xlog_decode_pre_hook_type) (XLogReaderState *state, + XLogRecord *record, + XLogRecPtr lsn, + bool inplace_allowed); + +/* Hook variable for WAL record transformation */ +extern PGDLLIMPORT xlog_decode_pre_hook_type xlog_decode_pre_hook; + /* * Macros that provide access to parts of the record most recently returned by * XLogReadRecord() or XLogNextRecord(). 
diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index a06833ce0a3..9cfb2aff5ae 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -244,5 +244,10 @@ typedef struct XLogRecordDataHeaderLong #define XLR_BLOCK_ID_DATA_LONG 254 #define XLR_BLOCK_ID_ORIGIN 253 #define XLR_BLOCK_ID_TOPLEVEL_XID 252 +/* + * I/O transform hook marker. Uses same header format as XLogRecordDataHeaderLong + * (1 byte id + 4 bytes length). Use SizeOfXLogRecordDataHeaderLong for size. + */ +#define XLR_BLOCK_ID_TRANSFORMED 251 #endif /* XLOGRECORD_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index abc2cf2a020..f18f77d3d22 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -189,7 +189,17 @@ typedef PageHeaderData *PageHeader; #define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to * everyone */ -#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */ +/* + * Transform ID field (5 bits: values 0-31) for I/O transform extensions. + * Value 0 means the page is not transformed (backward compatible). + * Values 1-31 are available for extensions to define their own meanings + * (e.g., encryption key versions, algorithm identifiers, migration markers). + */ +#define PD_TRANSFORM_ID_MASK 0x00F8 /* bits 3-7 */ +#define PD_TRANSFORM_ID_SHIFT 3 +#define PD_TRANSFORM_NONE 0 /* not transformed (core reserved) */ + +#define PD_VALID_FLAG_BITS 0x00FF /* OR of all valid pd_flags bits */ /* * Page layout version number 0 is for pre-7.3 Postgres releases. 
@@ -441,6 +451,19 @@ PageClearAllVisible(Page page) ((PageHeader) page)->pd_flags &= ~PD_ALL_VISIBLE; } +static inline uint8 +PageGetTransformId(const PageData *page) +{ + return (((const PageHeaderData *) page)->pd_flags & PD_TRANSFORM_ID_MASK) >> PD_TRANSFORM_ID_SHIFT; +} +static inline void +PageSetTransformId(Page page, uint8 id) +{ + ((PageHeader) page)->pd_flags = + (((PageHeader) page)->pd_flags & ~PD_TRANSFORM_ID_MASK) | + ((id << PD_TRANSFORM_ID_SHIFT) & PD_TRANSFORM_ID_MASK); +} + /* * These two require "access/transam.h", so left as macros. */ diff --git a/src/include/storage/md.h b/src/include/storage/md.h index b563c27abf0..0a766a2b61f 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -22,6 +22,71 @@ extern PGDLLIMPORT const PgAioHandleCallbacks aio_md_readv_cb; +/* + * Hook function types for I/O transformation (e.g., encryption/decryption). + * These hooks allow extensions to transform data during storage I/O operations. + */ + +/* + * Called after blocks are read from disk, before PostgreSQL's checksum verification. + * Extension can reverse-transform (e.g., decrypt) the data in place. + * + * For synchronous reads, called from mdreadv() after read completes. + * For AIO reads, called from buffer_readv_complete_one() before PageIsVerified(). + * + * Note: The hook is responsible for verifying on-disk checksum before reverse + * transformation and recalculating checksum after transformation. This ensures + * data integrity is verified at both stages and PostgreSQL's checksum verification + * passes. + * + * On failure, the hook should raise an ERROR (or PANIC for critical errors). + */ +typedef void (*mdread_post_hook_type) (RelFileLocator *rlocator, + ForkNumber forknum, + BlockNumber blocknum, + void **buffers, + BlockNumber nblocks); + +/* + * Called before mdwritev() writes blocks to disk. + * Extension can transform (e.g., encrypt) data. 
+ * Returns pointer to transformed buffers array (hook manages the memory, + * typically using static local storage). + * + * Note: The hook should recalculate checksum on transformed data after + * transformation. This on-disk checksum will be verified on read before + * reverse transformation, ensuring disk-level data integrity. + * + * On failure, the hook should raise an ERROR (or PANIC for critical errors), + * or return the original buffers with a WARNING as fallback. + */ +typedef const void **(*mdwrite_pre_hook_type) (RelFileLocator *rlocator, + ForkNumber forknum, + BlockNumber blocknum, + const void **buffers, + BlockNumber nblocks); + +/* + * Called before mdextend() extends a relation with new blocks. + * Returns pointer to transformed buffer (hook manages the memory, + * typically using static local storage). + * + * Note: Same as write hook - the hook should recalculate checksum on + * transformed data after transformation. + * + * On failure, the hook should raise an ERROR (or PANIC for critical errors), + * or return the original buffer with a WARNING as fallback. + */ +typedef const void *(*mdextend_pre_hook_type) (RelFileLocator *rlocator, + ForkNumber forknum, + BlockNumber blocknum, + const void *buffer); + +/* Hook variables for I/O transformation */ +extern PGDLLIMPORT mdread_post_hook_type mdread_post_hook; +extern PGDLLIMPORT mdwrite_pre_hook_type mdwrite_pre_hook; +extern PGDLLIMPORT mdextend_pre_hook_type mdextend_pre_hook; + /* md storage manager functionality */ extern void mdinit(void); extern void mdopen(SMgrRelation reln); -- 2.50.1 (Apple Git-155)