0007-Use-anonymous-files-to-back-shared-memory-s-20250616.patch
text/x-patch
Filename: 0007-Use-anonymous-files-to-back-shared-memory-s-20250616.patch
Type: text/x-patch
Part: 7
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch 0007
Subject: Use anonymous files to back shared memory segments
| File | + | − |
|---|---|---|
| src/backend/port/sysv_shmem.c | 63 | 10 |
| src/backend/port/win32_shmem.c | 1 | 1 |
| src/backend/storage/ipc/ipci.c | 1 | 1 |
| src/include/portability/mem.h | 1 | 1 |
| src/include/storage/pg_shmem.h | 2 | 1 |
From 441f537b64b6bc8f0f00fa0de7850911acff621c Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sat, 15 Mar 2025 16:39:45 +0100
Subject: [PATCH 07/17] Use anonymous files to back shared memory segments
Allow the use of anonymous files for shared memory, instead of plain
anonymous memory. Such an anonymous file is created via memfd_create; it
lives in memory, behaves like a regular file, and is semantically equivalent
to anonymous memory allocated via mmap with MAP_ANONYMOUS.
The advantages of using anonymous files are the following:
* We get a file descriptor, which can be used for regular file
operations (modification, truncation, you name it).
* The file can be given a name, which improves readability when it
comes to process maps. Here is what it looks like:
7f90cde00000-7f90d5126000 rw-s 00000000 00:01 5463 /memfd:main (deleted)
7f90d5126000-7f914de00000 ---p 00000000 00:00 0
7f914de00000-7f9175128000 rw-s 00000000 00:01 5466 /memfd:buffers (deleted)
7f9175128000-7f944de00000 ---p 00000000 00:00 0
7f944de00000-7f9455528000 rw-s 00000000 00:01 5469 /memfd:descriptors (deleted)
7f9455528000-7f94cde00000 ---p 00000000 00:00 0
7f94cde00000-7f94d5228000 rw-s 00000000 00:01 5472 /memfd:iocv (deleted)
7f94d5228000-7f954de00000 ---p 00000000 00:00 0
7f954de00000-7f9555266000 rw-s 00000000 00:01 5475 /memfd:checkpoint (deleted)
7f9555266000-7f958de00000 ---p 00000000 00:00 0
7f958de00000-7f95954aa000 rw-s 00000000 00:01 5478 /memfd:strategy (deleted)
7f95954aa000-7f95cde00000 ---p 00000000 00:00 0
* By default, Linux will not add file-backed shared mappings into a core dump,
making it more convenient to work with them in PostgreSQL: no more huge dumps
to process.
The downside is that memfd_create is Linux-specific.
---
src/backend/port/sysv_shmem.c | 73 +++++++++++++++++++++++++++++-----
src/backend/port/win32_shmem.c | 2 +-
src/backend/storage/ipc/ipci.c | 2 +-
src/include/portability/mem.h | 2 +-
src/include/storage/pg_shmem.h | 3 +-
5 files changed, 68 insertions(+), 14 deletions(-)
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index a3437973784..87000a24eea 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -107,6 +107,7 @@ typedef struct AnonymousMapping
Pointer shmem; /* Pointer to the start of the mapped memory */
Pointer seg_addr; /* SysV shared memory for the header */
unsigned long seg_id; /* IPC key */
+ int segment_fd; /* fd for the backing anon file */
} AnonymousMapping;
static AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -127,7 +128,7 @@ static int next_free_segment = 0;
* 00400000-00490000 /path/bin/postgres
* ...
* 012d9000-0133e000 [heap]
- * 7f443a800000-7f470a800000 /dev/zero (deleted)
+ * 7f443a800000-7f470a800000 /memfd:main (deleted)
* 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
* 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
* ...
@@ -150,9 +151,9 @@ static int next_free_segment = 0;
* The result would look like this:
*
* 012d9000-0133e000 [heap]
- * 7f4426f54000-7f442e010000 /dev/zero (deleted)
+ * 7f4426f54000-7f442e010000 /memfd:main (deleted)
* 7f442e010000-7f443a800000 # reserved empty space
- * 7f443a800000-7f444196c000 /dev/zero (deleted)
+ * 7f443a800000-7f444196c000 /memfd:buffers (deleted)
* 7f444196c000-7f470a800000 # reserved empty space
* 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
* 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
@@ -643,13 +644,14 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
* *hugepagesize and *mmap_flags are set to 0.
*/
void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
{
#ifdef MAP_HUGETLB
Size default_hugepagesize = 0;
Size hugepagesize_local = 0;
int mmap_flags_local = 0;
+ int memfd_flags_local = 0;
/*
* System-dependent code to find out the default huge page size.
@@ -708,6 +710,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
}
mmap_flags_local = MAP_HUGETLB;
+ memfd_flags_local = MFD_HUGETLB;
/*
* On recent enough Linux, also include the explicit page size, if
@@ -718,7 +721,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
{
int shift = pg_ceil_log2_64(hugepagesize_local);
- mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ }
+#endif
+
+#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT)
+ if (hugepagesize_local != default_hugepagesize)
+ {
+ int shift = pg_ceil_log2_64(hugepagesize_local);
+
+ memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
}
#endif
@@ -727,6 +739,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
*mmap_flags = mmap_flags_local;
if (hugepagesize)
*hugepagesize = hugepagesize_local;
+ if (memfd_flags)
+ *memfd_flags = memfd_flags_local;
#else
@@ -734,6 +748,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
*hugepagesize = 0;
if (mmap_flags)
*mmap_flags = 0;
+ if (memfd_flags)
+ *memfd_flags = 0;
#endif /* MAP_HUGETLB */
}
@@ -771,7 +787,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
Size allocsize = mapping->shmem_size;
void *ptr = MAP_FAILED;
int mmap_errno = 0;
- int mmap_flags = PG_MMAP_FLAGS;
+ int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0;
#ifndef MAP_HUGETLB
/* ReserveAnonymousMemory should have dealt with this case */
@@ -785,7 +801,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
/* Round up the request size to a suitable large value */
- GetHugePageSize(&hugepagesize, &mmap_flags);
+ GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags);
if (allocsize % hugepagesize != 0)
allocsize += hugepagesize - (allocsize % hugepagesize);
@@ -794,6 +810,29 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
}
#endif
+ /*
+ * Prepare an anonymous file backing the segment. Its size will be
+ * specified later via ftruncate.
+ *
+ * The file behaves like a regular file, but lives in memory. Once all
+ * references to the file are dropped, it is automatically released.
+ * Anonymous memory is used for all backing pages of the file, thus it has
+ * the same semantics as anonymous memory allocations using mmap with the
+ * MAP_ANONYMOUS flag.
+ */
+ mapping->segment_fd = memfd_create(MappingName(mapping->shmem_segment),
+ memfd_flags);
+
+ /*
+ * Specify the segment file size using allocsize, which contains
+ * potentially modified size.
+ */
+ if(ftruncate(mapping->segment_fd, allocsize) == -1)
+ ereport(FATAL,
+ (errcode(ERRCODE_SYSTEM_ERROR),
+ errmsg("could not truncase anonymous file for \"%s\": %m",
+ MappingName(mapping->shmem_segment))));
+
elog(DEBUG1, "segment[%s]: mmap(%zu) at address %p",
MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
@@ -807,7 +846,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
* a restart.
*/
ptr = mmap(base + reserved_offset, allocsize, PROT_READ | PROT_WRITE,
- mmap_flags | MAP_FIXED, -1, 0);
+ mmap_flags | MAP_FIXED, mapping->segment_fd, 0);
mmap_errno = errno;
if (ptr == MAP_FAILED)
@@ -817,8 +856,15 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base)
"fallback to the non-resizable allocation",
MappingName(mapping->shmem_segment), allocsize, base + reserved_offset);
+ /* Specify the segment file size using allocsize. */
+ if(ftruncate(mapping->segment_fd, allocsize) == -1)
+ ereport(FATAL,
+ (errcode(ERRCODE_SYSTEM_ERROR),
+ errmsg("could not truncase anonymous file for \"%s\": %m",
+ MappingName(mapping->shmem_segment))));
+
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS, -1, 0);
+ PG_MMAP_FLAGS, mapping->segment_fd, 0);
mmap_errno = errno;
}
else
@@ -889,7 +935,7 @@ ReserveAnonymousMemory(Size reserve_size)
Size hugepagesize, total_size = 0;
int mmap_flags;
- GetHugePageSize(&hugepagesize, &mmap_flags);
+ GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
/*
* Figure out how much memory is needed for all segments, keeping in
@@ -1070,6 +1116,13 @@ AnonymousShmemResize(void)
if (m->shmem_size == new_size)
continue;
+ /* Resize the backing anon file. */
+ if(ftruncate(m->segment_fd, new_size) == -1)
+ ereport(FATAL,
+ (errcode(ERRCODE_SYSTEM_ERROR),
+ errmsg("could not truncase anonymous file for \"%s\": %m",
+ MappingName(m->shmem_segment))));
+
/* Clean up some reserved space to resize into */
if (munmap(m->shmem + m->shmem_size, new_size - m->shmem_size) == -1)
ereport(FATAL,
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index ce719f1b412..ba972106de1 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -627,7 +627,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
* use GetLargePageMinimum() instead.
*/
void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
{
if (hugepagesize)
*hugepagesize = 0;
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index abeb91e24fd..dc2b4becf4a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -396,7 +396,7 @@ InitializeShmemGUCs(void)
/*
* Calculate the number of huge pages required.
*/
- GetHugePageSize(&hp_size, NULL);
+ GetHugePageSize(&hp_size, NULL, NULL);
if (hp_size != 0)
{
Size hp_required;
diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h
index ef9800732d9..40588ff6968 100644
--- a/src/include/portability/mem.h
+++ b/src/include/portability/mem.h
@@ -38,7 +38,7 @@
#define MAP_NOSYNC 0
#endif
-#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
+#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE)
/* Some really old systems don't define MAP_FAILED. */
#ifndef MAP_FAILED
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 19ad2e2f788..192b637cc65 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -125,7 +125,8 @@ extern PGShmemHeader *PGSharedMemoryCreate(Size size,
PGShmemHeader **shim, Pointer base);
extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
extern void PGSharedMemoryDetach(void);
-extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
+ int *memfd_flags);
void *ReserveAnonymousMemory(Size reserve_size);
bool ProcessBarrierShmemResize(Barrier *barrier);
--
2.34.1