From ba93989e755b23f3cef21ad0d11a14231e866b07 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Wed, 16 Oct 2024 20:24:58 +0200
Subject: [PATCH 4/7] Allow to resize shared memory without restart

Add an assign hook for shared_buffers to resize shared memory using the space
introduced in the previous commits, without requiring a PostgreSQL restart.
Size for every shared memory slot is recalculated based on the new
NBuffers, and extended using mremap. After allocating new space, new
shared structures (buffer blocks, descriptors, etc) are allocated as
needed. Here is what it looks like after raising shared_buffers from 128
MB to 512 MB and calling pg_reload_conf():

    -- 128 MB
    7f5a2bd04000-7f5a32e52000  /dev/zero (deleted)
    7f5a39252000-7f5a4030e000  /dev/zero (deleted)
    7f5a4670e000-7f5a4d7ba000  /dev/zero (deleted)
    7f5a53bba000-7f5a5ad26000  /dev/zero (deleted)
    7f5a9ad26000-7f5aa9d94000  /dev/zero (deleted)
    ^ buffers mapping, ~240 MB
    7f5d29d94000-7f5d30e00000  /dev/zero (deleted)

    -- 512 MB
    7f5a2bd04000-7f5a33274000  /dev/zero (deleted)
    7f5a39252000-7f5a4057e000  /dev/zero (deleted)
    7f5a4670e000-7f5a4d9fa000  /dev/zero (deleted)
    7f5a53bba000-7f5a5b1a6000  /dev/zero (deleted)
    7f5a9ad26000-7f5ac1f14000  /dev/zero (deleted)
    ^ buffers mapping, ~625 MB
    7f5d29d94000-7f5d30f80000  /dev/zero (deleted)

The implementation supports only increasing of shared_buffers. For
decreasing the value a similar procedure is needed. But the buffer
blocks with data have to be drained first, so that the actual data set
fits into the new smaller space.

Experiments show that shared mappings have to be extended separately for
each process that uses them. Another rough edge is that a backend executing
pg_reload_conf interactively will not resize its mappings immediately; for
some reason it requires another command.

Note that mremap is Linux-specific, thus the implementation is not very
portable.
---
 src/backend/port/sysv_shmem.c                 | 62 +++++++++++++
 src/backend/storage/buffer/buf_init.c         | 86 +++++++++++++++++++
 src/backend/storage/ipc/ipci.c                | 11 +++
 src/backend/storage/ipc/shmem.c               | 14 ++-
 .../utils/activity/wait_event_names.txt       |  1 +
 src/backend/utils/misc/guc_tables.c           |  4 +-
 src/include/storage/bufmgr.h                  |  1 +
 src/include/storage/lwlocklist.h              |  1 +
 src/include/storage/pg_shmem.h                |  2 +
 9 files changed, 171 insertions(+), 11 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 7157bf95b1a..72e823618ef 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -30,9 +30,11 @@
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "portability/mem.h"
+#include "storage/bufmgr.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "utils/guc.h"
 #include "utils/guc_hooks.h"
@@ -861,6 +863,66 @@ AnonymousShmemDetach(int status, Datum arg)
 	}
 }
 
+/*
+ * An assign callback for shared_buffers GUC -- a somewhat clumsy way of
+ * resizing shared memory without a restart. On NBuffers change use the new
+ * value to recalculate the required size for every shmem slot, then based on
+ * the new and old values initialize the new buffer blocks. 'extra' is unused.
+ *
+ * The actual slot resizing is done via mremap without flags, which fails with
+ * MAP_FAILED if there is not sufficient space to expand the mapping in place.
+ *
+ * XXX: For some reason in the current implementation the change is applied to
+ * the backend calling pg_reload_conf only at the backend exit.
+ */
+void
+AnonymousShmemResize(int newval, void *extra)
+{
+	int			numSemas;
+	bool		reinit = false;
+	int			NBuffersOld = NBuffers;
+
+	/*
+	 * XXX: Currently only increasing of shared_buffers is supported. For
+	 * decreasing something similar has to be done, but buffer blocks with
+	 * data have to be drained first.
+	 */
+	if (NBuffers > newval)
+		return;
+
+	/* XXX: Hack, NBuffers has to be exposed in the interface for
+	 * memory calculation and buffer blocks reinitialization instead. */
+	NBuffers = newval;
+
+	for (int i = 0; i < next_free_slot; i++)
+	{
+		Size		new_size = CalculateShmemSize(&numSemas, i);
+		AnonymousMapping *m = &Mappings[i];
+
+		if (m->shmem == NULL)
+			continue;
+		/* Nothing to do for slots whose size does not depend on NBuffers */
+		if (m->shmem_size == new_size)
+			continue;
+		/* mremap reports failure with MAP_FAILED, not a negative value */
+		if (mremap(m->shmem, m->shmem_size, new_size, 0) == MAP_FAILED)
+			elog(LOG, "mremap(%p, %zu) failed: %m",
+				 m->shmem, m->shmem_size);
+		else
+		{
+			reinit = true;
+			m->shmem_size = new_size;
+		}
+	}
+
+	if (reinit)
+	{
+		LWLockAcquire(ShmemResizeLock, LW_EXCLUSIVE);
+		BufferManagerShmemResize(NBuffersOld);
+		LWLockRelease(ShmemResizeLock);
+	}
+}
+
 /*
  * PGSharedMemoryCreate
  *
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index b066e97a0c9..ae58f82937f 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -153,6 +153,92 @@ BufferManagerShmemInit(void)
 						 &backend_flush_after);
 }
 
+/*
+ * Reinitialize shared memory structures whose size depends on NBuffers after
+ * shared_buffers was raised from NBuffersOld. Like BufferManagerShmemInit,
+ * but only descriptors in the range [NBuffersOld, NBuffers) are initialized.
+ */
+void
+BufferManagerShmemResize(int NBuffersOld)
+{
+	bool		foundBufs,
+				foundDescs,
+				foundIOCV,
+				foundBufCkpt;
+	int			i;
+
+	/* XXX: Only increasing is supported; with no new buffers do nothing */
+	if (NBuffersOld >= NBuffers)
+		return;
+
+	/* Align descriptors to a cacheline boundary. */
+	BufferDescriptors = (BufferDescPadded *)
+		ShmemInitStructInSlot("Buffer Descriptors",
+						NBuffers * sizeof(BufferDescPadded),
+						&foundDescs, BUFFER_DESCRIPTORS_SHMEM_SLOT);
+
+	/* Align condition variables to cacheline boundary. */
+	BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+		ShmemInitStructInSlot("Buffer IO Condition Variables",
+						NBuffers * sizeof(ConditionVariableMinimallyPadded),
+						&foundIOCV, BUFFER_IOCV_SHMEM_SLOT);
+
+	/*
+	 * The array used to sort to-be-checkpointed buffer ids is located in
+	 * shared memory, to avoid having to allocate significant amounts of
+	 * memory at runtime. As that'd be in the middle of a checkpoint, or when
+	 * the checkpointer is restarted, memory allocation failures would be
+	 * painful.
+	 */
+	CkptBufferIds = (CkptSortItem *)
+		ShmemInitStructInSlot("Checkpoint BufferIds",
+						NBuffers * sizeof(CkptSortItem), &foundBufCkpt,
+						CHECKPOINT_BUFFERS_SHMEM_SLOT);
+
+	/* Align buffer pool on IO page size boundary. */
+	BufferBlocks = (char *)
+		TYPEALIGN(PG_IO_ALIGN_SIZE,
+				  ShmemInitStructInSlot("Buffer Blocks",
+								  NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+								  &foundBufs, BUFFERS_SHMEM_SLOT));
+
+	/*
+	 * Initialize only the new headers; buffer NBuffersOld - 1 already exists
+	 * and may be in use, so it must not be reset.
+	 */
+	for (i = NBuffersOld; i < NBuffers; i++)
+	{
+		BufferDesc *buf = GetBufferDescriptor(i);
+
+		ClearBufferTag(&buf->tag);
+		pg_atomic_init_u32(&buf->state, 0);
+		buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
+
+		buf->buf_id = i;
+
+		/*
+		 * Link the new buffers together as unused. Subsequent management of
+		 * this list is done by freelist.c.
+		 */
+		buf->freeNext = i + 1;
+
+		LWLockInitialize(BufferDescriptorGetContentLock(buf),
+						 LWTRANCHE_BUFFER_CONTENT);
+
+		ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+	}
+
+	/* Correct last entry of linked list */
+	GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+
+	/* Init other shared buffer-management stuff */
+	StrategyInitialize(!foundDescs);
+
+	/* Initialize per-backend file flush context */
+	WritebackContextInit(&BackendWritebackContext,
+						 &backend_flush_after);
+}
+
 /*
  * BufferManagerShmemSize
  *
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index fd8b44b8161..15d06fd4ca4 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -83,6 +83,9 @@ RequestAddinShmemSpace(Size size)
  *
  * If num_semaphores is not NULL, it will be set to the number of semaphores
  * required.
+ *
+ * XXX: The calculation for non-main shared memory slots is incorrect: it
+ * includes more than is needed for buffers only.
  */
 Size
 CalculateShmemSize(int *num_semaphores, int shmem_slot)
@@ -149,6 +152,14 @@ CalculateShmemSize(int *num_semaphores, int shmem_slot)
 	size = add_size(size, InjectionPointShmemSize());
 	size = add_size(size, SlotSyncShmemSize());
 
+	/*
+	 * XXX: For some reason slightly more memory is needed for larger
+	 * shared_buffers, but this size is enough for any large value I've tested
+	 * with. Is it a mistake in how slots are split, or there was a hidden
+	 * inconsistency in shmem calculation?
+	 */
+	size = add_size(size, 1024 * 1024 * 100);
+
 	/* include additional requested shmem from preload libraries */
 	size = add_size(size, total_addin_request);
 
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 89d8c7baf16..faca7c9a525 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -490,17 +490,13 @@ ShmemInitStructInSlot(const char *name, Size size, bool *foundPtr,
 	{
 		/*
 		 * Structure is in the shmem index so someone else has allocated it
-		 * already.  The size better be the same as the size we are trying to
-		 * initialize to, or there is a name conflict (or worse).
+		 * already. Verify the structure's size:
+		 * - If it's the same, we've found the expected structure.
+		 * - If it's different, we're resizing the expected structure.
 		 */
 		if (result->size != size)
-		{
-			LWLockRelease(ShmemIndexLock);
-			ereport(ERROR,
-					(errmsg("ShmemIndex entry size is wrong for data structure"
-							" \"%s\": expected %zu, actual %zu",
-							name, size, result->size)));
-		}
+			result->size = size;
+
 		structPtr = result->location;
 	}
 	else
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 16144c2b72d..e8ecff5f7f0 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -345,6 +345,7 @@ WALSummarizer	"Waiting to read or update WAL summarization state."
 DSMRegistry	"Waiting to read or update the dynamic shared memory registry."
 InjectionPoint	"Waiting to read or update information related to injection points."
 SerialControl	"Waiting to read or update shared <filename>pg_serial</filename> state."
+ShmemResize	"Waiting to resize shared memory."
 
 #
 # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 8cf1afbad20..7a12eedbbd3 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2318,14 +2318,14 @@ struct config_int ConfigureNamesInt[] =
 	 * checking for overflow, so we mustn't allow more than INT_MAX / 2.
 	 */
 	{
-		{"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+		{"shared_buffers", PGC_SIGHUP, RESOURCES_MEM,
 			gettext_noop("Sets the number of shared memory buffers used by the server."),
 			NULL,
 			GUC_UNIT_BLOCKS
 		},
 		&NBuffers,
 		16384, 16, INT_MAX / 2,
-		NULL, NULL, NULL
+		NULL, AnonymousShmemResize, NULL
 	},
 
 	{
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 27c4cac8540..ead69a2974c 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -302,6 +302,7 @@ extern bool EvictUnpinnedBuffer(Buffer buf);
 /* in buf_init.c */
 extern void BufferManagerShmemInit(void);
 extern Size BufferManagerShmemSize(int);
+extern void BufferManagerShmemResize(int);
 
 /* in localbuf.c */
 extern void AtProcExit_LocalBuffers(void);
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 6a2f64c54fb..e8d379e4b0b 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -83,3 +83,4 @@ PG_LWLOCK(49, WALSummarizer)
 PG_LWLOCK(50, DSMRegistry)
 PG_LWLOCK(51, InjectionPoint)
 PG_LWLOCK(52, SerialControl)
+PG_LWLOCK(53, ShmemResize)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index c0143e38995..c1a96240d79 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -105,6 +105,8 @@ extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
 extern void PGSharedMemoryDetach(void);
 extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
 
+extern void AnonymousShmemResize(int newval, void *extra);
+
 /*
  * To be able to dynamically resize largest parts of the data stored in shared
  * memory, we split it into multiple shared memory mappings slots. Each slot
-- 
2.34.1

