0010-Reinitialize-StrategyControl-after-resizing-20250616.patch
text/x-patch
Filename: 0010-Reinitialize-StrategyControl-after-resizing-20250616.patch
Type: text/x-patch
Part: 8
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch 0010
Subject: Reinitialize StrategyControl after resizing buffers
| File | + | − |
|---|---|---|
| src/backend/port/sysv_shmem.c | 22 | 2 |
| src/backend/postmaster/bgwriter.c | 1 | 1 |
| src/backend/storage/buffer/buf_init.c | 9 | 2 |
| src/backend/storage/buffer/bufmgr.c | 57 | 17 |
| src/backend/storage/buffer/freelist.c | 133 | 0 |
| src/include/storage/buf_internals.h | 1 | 0 |
| src/include/storage/bufmgr.h | 2 | 1 |
| src/include/storage/ipc.h | 1 | 0 |
From 2d173c9580b125dbc1248d86fb603a8508501ede Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>
Date: Tue, 10 Jun 2025 11:00:36 +0530
Subject: [PATCH 10/17] Reinitialize StrategyControl after resizing buffers
... and BgBufferSync and ClockSweepTick adjustments
The commit introduces a separate function StrategyReInitialize() instead
of reusing StrategyInitialize() since some of the things that the second
one does are not required in the first one. Here's list of what
StrategyReInitialize() does and how does it differ from
StrategyInitialize().
1. When expanding the buffer pool add new buffers to the free list.
2. When shrinking buffers, we remove any buffers, in the area being
shrunk, from the freelist. While doing so we adjust the first and
last free buffer pointers in the StrategyControl area. Hence nothing
more needed after resizing.
3. Check the sanity of the free buffer list is added after resizing.
4. StrategyControl pointer needn't be fetched again since it should not
change. But added an Assert to make sure the pointer is valid.
5. &StrategyControl->buffer_strategy_lock need not be initialized again.
6. nextVictimBuffer, completePasses and numBufferAllocs are viewed in
the context of NBuffers. Now that NBuffers itself has changed, those
three do not make sense. Reset them as if the server has restarted
again.
This commit introduces a flag delay_shmem_resize, which postgresql
backends and workers can use to signal the coordinator to delay resizing
operation. Background writer sets this flag when its scanning buffers. Background
writer is blocked when the actual resizing is in progress. But if
resizing is about to begin, it does not scan the buffers by returning
from BgBufferSync(). It stops a scan in progress when it sees that the
resizing has begun. After the resizing is finished, it adjusts the
collected statistics according to the new size of the buffer pool at the
end of barrier processing.
Once the buffer resizing is finished, before resuming the regular
operation, bgwriter resets the information saved so far. This
information is viewed in the context of NBuffers and hence does not make
sense after NBuffer has changed.
Ashutosh Bapat
---
src/backend/port/sysv_shmem.c | 24 ++++-
src/backend/postmaster/bgwriter.c | 2 +-
src/backend/storage/buffer/buf_init.c | 11 ++-
src/backend/storage/buffer/bufmgr.c | 74 ++++++++++----
src/backend/storage/buffer/freelist.c | 133 ++++++++++++++++++++++++++
src/include/storage/buf_internals.h | 1 +
src/include/storage/bufmgr.h | 3 +-
src/include/storage/ipc.h | 1 +
8 files changed, 226 insertions(+), 23 deletions(-)
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index c03464b40e2..9144f101585 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -114,6 +114,7 @@ static AnonymousMapping Mappings[ANON_MAPPINGS];
/* Flag telling postmaster that resize is needed */
volatile bool pending_pm_shmem_resize = false;
+volatile bool delay_shmem_resize = false;
/* Keeps track of the previous NBuffers value */
static int NBuffersOld = -1;
@@ -1207,6 +1208,12 @@ AnonymousShmemResize(void)
LWLockRelease(ShmemResizeLock);
}
+
+ /*
+ * TODO: Shouldn't we call ResizeBufferPool() here as well? Or those
+ * backend who can not lock the LWLock conditionally won't resize the
+ * buffers.
+ */
}
return true;
@@ -1225,13 +1232,17 @@ AnonymousShmemResize(void)
bool
ProcessBarrierShmemResize(Barrier *barrier)
{
- elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d",
- NBuffersOld, NBuffersPending, pending_pm_shmem_resize);
+ elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d, %d",
+ NBuffersOld, NBuffersPending, pending_pm_shmem_resize, delay_shmem_resize);
/* Wait until we have seen the new NBuffers value */
if (!pending_pm_shmem_resize)
return false;
+ /* Wait till this process becomes ready to resize buffers. */
+ if (delay_shmem_resize)
+ return false;
+
/*
* First thing to do after attaching to the barrier is to wait for others.
* We can't simply use BarrierArriveAndWait, because backends might arrive
@@ -1281,6 +1292,15 @@ ProcessBarrierShmemResize(Barrier *barrier)
/* The second phase means the resize has finished, SHMEM_RESIZE_DONE */
BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE);
+ if (MyBackendType == B_BG_WRITER)
+ {
+ /*
+ * Before resuming regular background writer activity, adjust the
+ * statistics collected so far.
+ */
+ BgBufferSyncAdjust(NBuffersOld, NBuffers);
+ }
+
BarrierDetach(barrier);
return true;
}
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 72f5acceec7..32b34f28ead 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -233,7 +233,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len)
/*
* Do one cycle of dirty-buffer writing.
*/
- can_hibernate = BgBufferSync(&wb_context);
+ can_hibernate = BgBufferSync(&wb_context, false);
/* Report pending statistics to the cumulative stats system */
pgstat_report_bgwriter();
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index f78be4700df..7b8bc577bd5 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -163,8 +163,15 @@ BufferManagerShmemInit(int FirstBufferToInit)
if (FirstBufferToInit < NBuffers)
GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
- /* Init other shared buffer-management stuff */
- StrategyInitialize(!foundDescs);
+ /*
+ * Init other shared buffer-management stuff from scratch configuring buffer
+ * pool the first time. If we are just resizing buffer pool adjust only the
+ * required structures.
+ */
+ if (FirstBufferToInit == 0)
+ StrategyInitialize(!foundDescs);
+ else
+ StrategyReInitialize(FirstBufferToInit);
/* Initialize per-backend file flush context */
WritebackContextInit(&BackendWritebackContext,
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 57d78c482bb..194d5da2999 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3611,6 +3611,32 @@ BufferSync(int flags)
TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
}
+/*
+ * Information saved between BgBufferSync() calls so we can determine the
+ * strategy point's advance rate and avoid scanning already-cleaned buffers. The
+ * variables are global instead of static local so that BgBufferSyncAdjust() can
+ * adjust it when resizing shared buffers.
+ */
+static bool saved_info_valid = false;
+static int prev_strategy_buf_id;
+static uint32 prev_strategy_passes;
+static int next_to_clean;
+static uint32 next_passes;
+
+/* Moving averages of allocation rate and clean-buffer density */
+static float smoothed_alloc = 0;
+static float smoothed_density = 10.0;
+
+void
+BgBufferSyncAdjust(int NBuffersOld, int NBuffersNew)
+{
+ saved_info_valid = false;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d",
+ NBuffersOld, NBuffersNew);
+#endif
+}
+
/*
* BgBufferSync -- Write out some dirty buffers in the pool.
*
@@ -3623,27 +3649,13 @@ BufferSync(int flags)
* bgwriter_lru_maxpages to 0.)
*/
bool
-BgBufferSync(WritebackContext *wb_context)
+BgBufferSync(WritebackContext *wb_context, bool reset)
{
/* info obtained from freelist.c */
int strategy_buf_id;
uint32 strategy_passes;
uint32 recent_alloc;
- /*
- * Information saved between calls so we can determine the strategy
- * point's advance rate and avoid scanning already-cleaned buffers.
- */
- static bool saved_info_valid = false;
- static int prev_strategy_buf_id;
- static uint32 prev_strategy_passes;
- static int next_to_clean;
- static uint32 next_passes;
-
- /* Moving averages of allocation rate and clean-buffer density */
- static float smoothed_alloc = 0;
- static float smoothed_density = 10.0;
-
/* Potentially these could be tunables, but for now, not */
float smoothing_samples = 16;
float scan_whole_pool_milliseconds = 120000.0;
@@ -3666,6 +3678,22 @@ BgBufferSync(WritebackContext *wb_context)
long new_strategy_delta;
uint32 new_recent_alloc;
+ /*
+ * If buffer pool is being shrunk the buffer being written out may not remain
+ * valid. If the buffer pool is being expanded, more buffers will become
+ * available without even this function writing out any. Hence wait till
+ * buffer resizing finishes i.e. go into hibernation mode.
+ */
+ if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers)
+ return true;
+
+ /*
+ * Resizing shared buffers while this function is performing an LRU scan on
+ * them may lead to wrong results. Indicate that the resizing should wait for
+ * the LRU scan to complete.
+ */
+ delay_shmem_resize = true;
+
/*
* Find out where the freelist clock sweep currently is, and how many
* buffer allocations have happened since our last call.
@@ -3842,8 +3870,17 @@ BgBufferSync(WritebackContext *wb_context)
num_written = 0;
reusable_buffers = reusable_buffers_est;
- /* Execute the LRU scan */
- while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+ /*
+ * Execute the LRU scan.
+ *
+ * If buffer pool is being shrunk, the buffer being written may not remain
+ * valid. If the buffer pool is being expanded, more buffers will become
+ * available without even this function writing any. Hence stop what we are doing. This
+ * also unblocks other processes that are waiting for buffer resizing to
+ * finish.
+ */
+ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est &&
+ pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) == NBuffers)
{
int sync_state = SyncOneBuffer(next_to_clean, true,
wb_context);
@@ -3902,6 +3939,9 @@ BgBufferSync(WritebackContext *wb_context)
#endif
}
+ /* Let the resizing commence. */
+ delay_shmem_resize = false;
+
/* Return true if OK to hibernate */
return (bufs_to_lap == 0 && recent_alloc == 0);
}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7b9ed010e2f..41641bb3ae6 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -98,6 +98,9 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
BufferDesc *buf);
+#ifdef USE_ASSERT_CHECKING
+static void StrategyValidateFreeList(void);
+#endif /* USE_ASSERT_CHECKING */
/*
* ClockSweepTick - Helper routine for StrategyGetBuffer()
@@ -526,6 +529,88 @@ StrategyInitialize(bool init)
Assert(!init);
}
+/*
+ * StrategyReInitialize -- re-initialize the buffer cache replacement
+ * strategy.
+ *
+ * To be called when resizing buffer manager and only from the coordinator.
+ * TODO: Assess the differences between this function and StrategyInitialize().
+ */
+void
+StrategyReInitialize(int FirstBufferIdToInit)
+{
+ bool found;
+
+ /*
+ * Resizing memory for buffer pools should not affect the address of
+ * StrategyControl.
+ */
+ if (StrategyControl != (BufferStrategyControl *)
+ ShmemInitStructInSegment("Buffer Strategy Status",
+ sizeof(BufferStrategyControl),
+ &found, STRATEGY_SHMEM_SEGMENT))
+ elog(FATAL, "something went wrong while re-initializing the buffer strategy");
+
+ Assert(found);
+
+ /* TODO: Buffer lookup table adjustment: There are two options:
+ *
+ * 1. Resize the buffer lookup table to match the new number of buffers. But
+ * this requires rehashing all the entries in the buffer lookup table with
+ * the new table size.
+ *
+ * 2. Allocate maximum size of the buffer lookup table at the beginning and
+ * never resize it. This leaves sparse buffer lookup table which is
+ * inefficient from both memory and time perspective. According to David
+ * Rowley, the sparse entries in the buffer look up table cause frequent
+ * cacheline reload which affect performance. If the impact of that
+ * inefficiency in a benchmark is significant, we will need to consider first
+ * option.
+ */
+
+ /*
+ * When shrinking buffers, we must have adjusted the first and the last free
+ * buffer when removing the buffers being shrunk from the free list. Nothing
+ * to be done here.
+ *
+ * When expanding the shared buffers, new buffers are added at the end of the
+ * freelist or they form the new free list if there are no free buffers.
+ */
+ if (FirstBufferIdToInit < NBuffers)
+ {
+ if (StrategyControl->firstFreeBuffer == FREENEXT_END_OF_LIST)
+ StrategyControl->firstFreeBuffer = FirstBufferIdToInit;
+ else
+ {
+ Assert(StrategyControl->lastFreeBuffer >= 0);
+ GetBufferDescriptor(StrategyControl->lastFreeBuffer - 1)->freeNext = FirstBufferIdToInit;
+ }
+
+ StrategyControl->lastFreeBuffer = NBuffers - 1;
+ }
+
+ /* Check free list sanity after resizing. */
+#ifdef USE_ASSERT_CHECKING
+ StrategyValidateFreeList();
+#endif /* USE_ASSERT_CHECKING */
+
+ /*
+ * The clock sweep tick pointer might have got invalidated. Reset it as if
+ * starting a fresh server.
+ */
+ pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+ /*
+ * The old statistics is viewed in the context of the number of shared
+ * buffers. It does not make sense now that the number of shared buffers
+ * itself has changed.
+ */
+ StrategyControl->completePasses = 0;
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+
+ /* No pending notification */
+ StrategyControl->bgwprocno = -1;
+}
/*
* StrategyPurgeFreeList -- remove all buffers with id higher than the number of
@@ -595,6 +680,54 @@ StrategyPurgeFreeList(int numBuffers)
*/
}
+#ifdef USE_ASSERT_CHECKING
+/*
+ * StrategyValidateFreeList-- check sanity of free buffer list.
+ */
+static void
+StrategyValidateFreeList(void)
+{
+ int nextFree = StrategyControl->firstFreeBuffer;
+ int numFreeBuffers = 0;
+ int lastFreeBuffer = FREENEXT_END_OF_LIST;
+
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ while (nextFree != FREENEXT_END_OF_LIST)
+ {
+ BufferDesc *buf = GetBufferDescriptor(nextFree);
+
+ /* nextFree should be id of buffer being examined. */
+ Assert(nextFree == buf->buf_id);
+ Assert(buf->buf_id < NBuffers);
+ /* The buffer should not be marked as not in the list. */
+ Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+ /* Update our knowledge of last buffer in the free list. */
+ lastFreeBuffer = buf->buf_id;
+
+ numFreeBuffers++;
+
+ /* Avoid infinite recursion in case there are cycles in free list. */
+ if (numFreeBuffers > NBuffers)
+ break;
+
+ nextFree = buf->freeNext;
+ }
+
+ Assert(numFreeBuffers <= NBuffers);
+
+ /*
+ * Make sure that the StrategyControl's knowledge of last free buffer
+ * agrees with what's there in the free list.
+ */
+ if (StrategyControl->firstFreeBuffer != FREENEXT_END_OF_LIST)
+ Assert(StrategyControl->lastFreeBuffer == lastFreeBuffer);
+
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+#endif /* USE_ASSERT_CHECKING */
+
/* ----------------------------------------------------------------
* Backend-private buffer ring management
* ----------------------------------------------------------------
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index add15e3723b..46949e9d90e 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -454,6 +454,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern void StrategyPurgeFreeList(int numBuffers);
+extern void StrategyReInitialize(int FirstBufferToInit);
extern bool have_free_buffer(void);
/* buf_table.c */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 0c554f0b130..83a75eab844 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -298,7 +298,8 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer);
extern bool IsBufferCleanupOK(Buffer buffer);
extern bool HoldingBufferPinThatDelaysRecovery(void);
-extern bool BgBufferSync(struct WritebackContext *wb_context);
+extern bool BgBufferSync(struct WritebackContext *wb_context, bool reset);
+extern void BgBufferSyncAdjust(int NBuffersOld, int NBuffersNew);
extern uint32 GetPinLimit(void);
extern uint32 GetLocalPinLimit(void);
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
index bb7ae4d33b3..7d1c64a9267 100644
--- a/src/include/storage/ipc.h
+++ b/src/include/storage/ipc.h
@@ -65,6 +65,7 @@ typedef void (*shmem_startup_hook_type) (void);
extern PGDLLIMPORT bool proc_exit_inprogress;
extern PGDLLIMPORT bool shmem_exit_inprogress;
extern PGDLLIMPORT volatile bool pending_pm_shmem_resize;
+extern PGDLLIMPORT volatile bool delay_shmem_resize;
pg_noreturn extern void proc_exit(int code);
extern void shmem_exit(int code);
--
2.34.1