0006-Use-anonymous-files-to-back-shared-memory-s-20250228.patch
text/x-patch
Filename: 0006-Use-anonymous-files-to-back-shared-memory-s-20250228.patch
Type: text/x-patch
Part: 5
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch 0006
Subject: Use anonymous files to back shared memory segments
| File | + | − |
|---|---|---|
| src/backend/port/sysv_shmem.c | 38 | 8 |
| src/include/portability/mem.h | 1 | 1 |
From be911372e4de4b2b98699512007ff8055dbea2f2 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 23 Feb 2025 14:42:39 +0100
Subject: [PATCH 06/11] Use anonymous files to back shared memory segments
Allow using anonymous files for shared memory, instead of plain
anonymous memory. Such an anonymous file is created via memfd_create; it
lives in memory, behaves like a regular file, and is semantically equivalent
to anonymous memory allocated via mmap with MAP_ANONYMOUS.
The advantages of using anonymous files are the following:
* We've got a file descriptor, which could be used for regular file
operations (modification, truncation, you name it).
* The file can be given a name, which improves readability when it
comes to process maps. Here is what it looks like:
7f5a2bd04000-7f5a32e52000 rw-s 00000000 00:01 1845 /memfd:strategy (deleted)
7f5a39252000-7f5a4030e000 rw-s 00000000 00:01 1842 /memfd:checkpoint (deleted)
7f5a4670e000-7f5a4d7ba000 rw-s 00000000 00:01 1839 /memfd:iocv (deleted)
7f5a53bba000-7f5a5ad26000 rw-s 00000000 00:01 1836 /memfd:descriptors (deleted)
7f5a9ad26000-7f5aa9d94000 rw-s 00000000 00:01 1833 /memfd:buffers (deleted)
7f5d29d94000-7f5d30e00000 rw-s 00000000 00:01 1830 /memfd:main (deleted)
* By default, Linux will not add file-backed shared mappings into a core dump,
making it more convenient to work with them in PostgreSQL: no more huge dumps
to process.
The downside is that memfd_create is Linux-specific.
---
src/backend/port/sysv_shmem.c | 46 +++++++++++++++++++++++++++++------
src/include/portability/mem.h | 2 +-
2 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 35a8ff92175..8864866f26c 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -105,6 +105,7 @@ typedef struct AnonymousMapping
void *shmem; /* Pointer to the start of the mapped memory */
void *seg_addr; /* SysV shared memory for the header */
unsigned long seg_id; /* IPC key */
+ int segment_fd; /* fd for the backing anon file */
} AnonymousMapping;
static AnonymousMapping Mappings[ANON_MAPPINGS];
@@ -125,7 +126,7 @@ static int next_free_segment = 0;
* 00400000-00490000 /path/bin/postgres
* ...
* 012d9000-0133e000 [heap]
- * 7f443a800000-7f470a800000 /dev/zero (deleted)
+ * 7f443a800000-7f470a800000 /memfd:main (deleted)
* 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
* 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
* ...
@@ -152,9 +153,9 @@ static int next_free_segment = 0;
* The result would look like this:
*
* 012d9000-0133e000 [heap]
- * 7f4426f54000-7f442e010000 /dev/zero (deleted)
+ * 7f4426f54000-7f442e010000 /memfd:main (deleted)
* [...free space...]
- * 7f443a800000-7f444196c000 /dev/zero (deleted)
+ * 7f443a800000-7f444196c000 /memfd:buffers (deleted)
* [...free space...]
* 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
* 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2
@@ -717,6 +718,18 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
void *ptr = MAP_FAILED;
int mmap_errno = 0;
+ /*
+ * Prepare an anonymous file backing the segment. Its size will be
+ * specified later via ftruncate.
+ *
+ * The file behaves like a regular file, but lives in memory. Once all
+ * references to the file are dropped, it is automatically released.
+ * Anonymous memory is used for all backing pages of the file, thus it has
+ * the same semantics as anonymous memory allocations using mmap with the
+ * MAP_ANONYMOUS flag.
+ */
+ mapping->segment_fd = memfd_create(MappingName(mapping->shmem_segment), 0);
+
#ifndef MAP_HUGETLB
/* PGSharedMemoryCreate should have dealt with this case */
Assert(huge_pages != HUGE_PAGES_ON);
@@ -734,8 +747,13 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
if (allocsize % hugepagesize != 0)
allocsize += hugepagesize - (allocsize % hugepagesize);
+ /*
+ * Do not use an anonymous file here yet. When adding it, do not forget
+ * to use ftruncate and flags MFD_HUGETLB & MFD_HUGE_2MB/MFD_HUGE_1GB
+ * in memfd_create.
+ */
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS | mmap_flags, -1, 0);
+ PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0);
mmap_errno = errno;
if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
{
@@ -771,7 +789,8 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
* - First create the temporary probe mapping of a fixed size and let
* kernel to place it at address of its choice. By the virtue of the
* probe mapping size we expect it to be located at the lowest
- * possible address, expecting some non mapped space above.
+ * possible address, expecting some non mapped space above. The probe
+ * does not need to be backed by an anonymous file.
*
* - Unmap the probe mapping, remember the address.
*
@@ -786,7 +805,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
* without a restart.
*/
probe = mmap(NULL, PROBE_MAPPING_SIZE, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS, -1, 0);
+ PG_MMAP_FLAGS | MAP_ANONYMOUS, -1, 0);
if (probe == MAP_FAILED)
{
@@ -802,8 +821,14 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
munmap(probe, PROBE_MAPPING_SIZE);
+ /*
+ * Specify the segment file size using allocsize, which contains the
+ * potentially modified size.
+ */
+ ftruncate(mapping->segment_fd, allocsize);
+
ptr = mmap(probe - offset, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS | MAP_FIXED_NOREPLACE, -1, 0);
+ PG_MMAP_FLAGS | MAP_FIXED_NOREPLACE, mapping->segment_fd, 0);
mmap_errno = errno;
if (ptr == MAP_FAILED)
{
@@ -822,8 +847,11 @@ CreateAnonymousSegment(AnonymousMapping *mapping)
*/
allocsize = mapping->shmem_size;
+ /* Specify the segment file size using allocsize. */
+ ftruncate(mapping->segment_fd, allocsize);
+
ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS, -1, 0);
+ PG_MMAP_FLAGS, mapping->segment_fd, 0);
mmap_errno = errno;
}
@@ -917,6 +945,8 @@ AnonymousShmemResize(void)
if (m->shmem_size == new_size)
continue;
+ /* Resize the backing anon file. */
+ ftruncate(m->segment_fd, new_size);
/*
* Fail hard if faced any issues. In theory we could try to handle this
diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h
index ef9800732d9..40588ff6968 100644
--- a/src/include/portability/mem.h
+++ b/src/include/portability/mem.h
@@ -38,7 +38,7 @@
#define MAP_NOSYNC 0
#endif
-#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
+#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE)
/* Some really old systems don't define MAP_FAILED. */
#ifndef MAP_FAILED
--
2.34.1