v6-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch
text/x-patch
Filename: v6-0001-Refactor-goto-into-for-loop-in-GetVictimBuffer.patch
Type: text/x-patch
Part: 3
Message:
Re: Checkpointer write combining
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v6-0001
Subject: Refactor goto into for loop in GetVictimBuffer()
| File | + | − |
|---|---|---|
| src/backend/storage/buffer/bufmgr.c | 94 | 106 |
| src/backend/storage/buffer/freelist.c | 25 | 7 |
| src/include/storage/buf_internals.h | 5 | 0 |
From 6c46b33c7a51990f1d2df0fab7dfea2f88e0861e Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 2 Sep 2025 11:00:44 -0400
Subject: [PATCH v6 1/9] Refactor goto into for loop in GetVictimBuffer()
GetVictimBuffer() implemented a loop to optimistically lock a clean
victim buffer using a goto. Future commits will add batch flushing
functionality to GetVictimBuffer. The new logic works better with
standard for loop flow control.
This commit is only a refactor and does not introduce any new
functionality.
Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com
Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com
---
src/backend/storage/buffer/bufmgr.c | 200 ++++++++++++--------------
src/backend/storage/buffer/freelist.c | 32 ++++-
src/include/storage/buf_internals.h | 5 +
3 files changed, 124 insertions(+), 113 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index fe470de63f2..f3668051574 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -68,10 +68,6 @@
#include "utils/timestamp.h"
-/* Note: these two macros only work on shared buffers, not local ones! */
-#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
-#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
-
/* Note: this macro only works on local buffers, not shared ones! */
#define LocalBufHdrGetBlock(bufHdr) \
LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
@@ -2344,130 +2340,122 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
ReservePrivateRefCountEntry();
ResourceOwnerEnlarge(CurrentResourceOwner);
- /* we return here if a prospective victim buffer gets used concurrently */
-again:
-
- /*
- * Select a victim buffer. The buffer is returned with its header
- * spinlock still held!
- */
- buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
- buf = BufferDescriptorGetBuffer(buf_hdr);
-
- Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
-
- /* Pin the buffer and then release the buffer spinlock */
- PinBuffer_Locked(buf_hdr);
-
- /*
- * We shouldn't have any other pins for this buffer.
- */
- CheckBufferIsPinnedOnce(buf);
-
- /*
- * If the buffer was dirty, try to write it out. There is a race
- * condition here, in that someone might dirty it after we released the
- * buffer header lock above, or even while we are writing it out (since
- * our share-lock won't prevent hint-bit updates). We will recheck the
- * dirty bit after re-locking the buffer header.
- */
- if (buf_state & BM_DIRTY)
+ /* Select a victim buffer using an optimistic locking scheme. */
+ for (;;)
{
- LWLock *content_lock;
+ /*
+ * Attempt to claim a victim buffer. The buffer is returned with its
+ * header spinlock still held!
+ */
+ buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+ buf = BufferDescriptorGetBuffer(buf_hdr);
- Assert(buf_state & BM_TAG_VALID);
- Assert(buf_state & BM_VALID);
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+
+ /* Pin the buffer and then release the buffer spinlock */
+ PinBuffer_Locked(buf_hdr);
/*
- * We need a share-lock on the buffer contents to write it out (else
- * we might write invalid data, eg because someone else is compacting
- * the page contents while we write). We must use a conditional lock
- * acquisition here to avoid deadlock. Even though the buffer was not
- * pinned (and therefore surely not locked) when StrategyGetBuffer
- * returned it, someone else could have pinned and exclusive-locked it
- * by the time we get here. If we try to get the lock unconditionally,
- * we'd block waiting for them; if they later block waiting for us,
- * deadlock ensues. (This has been observed to happen when two
- * backends are both trying to split btree index pages, and the second
- * one just happens to be trying to split the page the first one got
- * from StrategyGetBuffer.)
+ * We shouldn't have any other pins for this buffer.
*/
- content_lock = BufferDescriptorGetContentLock(buf_hdr);
- if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
- {
- /*
- * Someone else has locked the buffer, so give it up and loop back
- * to get another one.
- */
- UnpinBuffer(buf_hdr);
- goto again;
- }
+ CheckBufferIsPinnedOnce(buf);
/*
- * If using a nondefault strategy, and writing the buffer would
- * require a WAL flush, let the strategy decide whether to go ahead
- * and write/reuse the buffer or to choose another victim. We need a
- * lock to inspect the page LSN, so this can't be done inside
- * StrategyGetBuffer.
+ * If the buffer was dirty, try to write it out. There is a race
+ * condition here, in that someone might dirty it after we released
+ * the buffer header lock above, or even while we are writing it out
+ * (since our share-lock won't prevent hint-bit updates). We will
+ * recheck the dirty bit after re-locking the buffer header.
*/
- if (strategy != NULL)
+ if (buf_state & BM_DIRTY)
{
- XLogRecPtr lsn;
+ LWLock *content_lock;
- /* Read the LSN while holding buffer header lock */
- buf_state = LockBufHdr(buf_hdr);
- lsn = BufferGetLSN(buf_hdr);
- UnlockBufHdr(buf_hdr, buf_state);
+ Assert(buf_state & BM_TAG_VALID);
+ Assert(buf_state & BM_VALID);
- if (XLogNeedsFlush(lsn)
- && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+ /*
+ * We need a share-lock on the buffer contents to write it out
+ * (else we might write invalid data, eg because someone else is
+ * compacting the page contents while we write). We must use a
+ * conditional lock acquisition here to avoid deadlock. Even
+ * though the buffer was not pinned (and therefore surely not
+ * locked) when StrategyGetBuffer returned it, someone else could
+ * have pinned and exclusive-locked it by the time we get here. If
+ * we try to get the lock unconditionally, we'd block waiting for
+ * them; if they later block waiting for us, deadlock ensues.
+ * (This has been observed to happen when two backends are both
+ * trying to split btree index pages, and the second one just
+ * happens to be trying to split the page the first one got from
+ * StrategyGetBuffer.)
+ */
+ content_lock = BufferDescriptorGetContentLock(buf_hdr);
+ if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+ {
+ /*
+ * Someone else has locked the buffer, so give it up and loop
+ * back to get another one.
+ */
+ UnpinBuffer(buf_hdr);
+ continue;
+ }
+
+ /*
+ * If using a nondefault strategy, and writing the buffer would
+ * require a WAL flush, let the strategy decide whether to go
+ * ahead and write/reuse the buffer or to choose another victim.
+ * We need the content lock to inspect the page LSN, so this can't
+ * be done inside StrategyGetBuffer.
+ */
+ if (StrategyRejectBuffer(strategy, buf_hdr, from_ring))
{
LWLockRelease(content_lock);
UnpinBuffer(buf_hdr);
- goto again;
+ continue;
}
- }
- /* OK, do the I/O */
- FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
- LWLockRelease(content_lock);
+ /* OK, do the I/O */
+ FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+ LWLockRelease(content_lock);
- ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
- &buf_hdr->tag);
- }
+ ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+ &buf_hdr->tag);
+ }
+ if (buf_state & BM_VALID)
+ {
+ /*
+ * When a BufferAccessStrategy is in use, blocks evicted from
+ * shared buffers are counted as IOOP_EVICT in the corresponding
+ * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted
+ * by a strategy in two cases: 1) while initially claiming buffers
+ * for the strategy ring 2) to replace an existing strategy ring
+ * buffer because it is pinned or in use and cannot be reused.
+ *
+ * Blocks evicted from buffers already in the strategy ring are
+ * counted as IOOP_REUSE in the corresponding strategy context.
+ *
+ * At this point, we can accurately count evictions and reuses,
+ * because we have successfully claimed the valid buffer.
+ * Previously, we may have been forced to release the buffer due
+ * to concurrent pinners or erroring out.
+ */
+ pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+ from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
+ }
- if (buf_state & BM_VALID)
- {
/*
- * When a BufferAccessStrategy is in use, blocks evicted from shared
- * buffers are counted as IOOP_EVICT in the corresponding context
- * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
- * strategy in two cases: 1) while initially claiming buffers for the
- * strategy ring 2) to replace an existing strategy ring buffer
- * because it is pinned or in use and cannot be reused.
- *
- * Blocks evicted from buffers already in the strategy ring are
- * counted as IOOP_REUSE in the corresponding strategy context.
- *
- * At this point, we can accurately count evictions and reuses,
- * because we have successfully claimed the valid buffer. Previously,
- * we may have been forced to release the buffer due to concurrent
- * pinners or erroring out.
+ * If the buffer has an entry in the buffer mapping table, delete it.
+ * This can fail because another backend could have pinned or dirtied
+ * the buffer. Then loop around and try again.
*/
- pgstat_count_io_op(IOOBJECT_RELATION, io_context,
- from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
- }
+ if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+ {
+ UnpinBuffer(buf_hdr);
+ continue;
+ }
- /*
- * If the buffer has an entry in the buffer mapping table, delete it. This
- * can fail because another backend could have pinned or dirtied the
- * buffer.
- */
- if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
- {
- UnpinBuffer(buf_hdr);
- goto again;
+ break;
}
/* a final set of sanity checks */
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7d59a92bd1a..12bb7e2312e 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,7 @@
*/
#include "postgres.h"
+#include "access/xlog.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf_internals.h"
@@ -716,12 +717,21 @@ IOContextForStrategy(BufferAccessStrategy strategy)
* be written out and doing so would require flushing WAL too. This gives us
* a chance to choose a different victim.
*
+ * The buffer must be pinned and content locked and the buffer header spinlock
+ * must not be held. We must hold the content lock to examine the LSN.
+ *
* Returns true if buffer manager should ask for a new victim, and false
- * if this buffer should be written and re-used.
+ * if this buffer should be written and reused.
*/
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
+ uint32 buf_state;
+ XLogRecPtr lsn;
+
+ if (!strategy)
+ return false;
+
/* We only do this in bulkread mode */
if (strategy->btype != BAS_BULKREAD)
return false;
@@ -731,11 +741,19 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
return false;
- /*
- * Remove the dirty buffer from the ring; necessary to prevent infinite
- * loop if all ring members are dirty.
- */
- strategy->buffers[strategy->current] = InvalidBuffer;
+ buf_state = LockBufHdr(buf);
+ lsn = BufferGetLSN(buf);
+ UnlockBufHdr(buf, buf_state);
+
+ if (XLogNeedsFlush(lsn))
+ {
+ /*
+ * Remove the dirty buffer from the ring; necessary to prevent an
+ * infinite loop if all ring members are dirty.
+ */
+ strategy->buffers[strategy->current] = InvalidBuffer;
+ return true;
+ }
- return true;
+ return false;
}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca4..b1b81f31419 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -419,6 +419,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
/*
* Internal buffer management routines
*/
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
+
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context);
--
2.43.0