v20251109-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
application/octet-stream
Filename: v20251109-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
Type: application/octet-stream
Part: 2
Message:
Re: index prefetching
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v20251109-0003
Subject: Reduce malloc/free traffic by caching batches
| File | + | − |
|---|---|---|
| src/backend/access/index/indexam.c | 188 | 14 |
| src/backend/access/nbtree/nbtree.c | 9 | 7 |
| src/backend/access/nbtree/nbtsearch.c | 2 | 2 |
| src/include/access/genam.h | 3 | 1 |
| src/include/access/relscan.h | 9 | 0 |
From 6fe6c4b4d5e52c2e476e292d0c1d4e3add3f6b63 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 10 Sep 2025 16:54:50 -0400
Subject: [PATCH v20251109 3/3] Reduce malloc/free traffic by caching batches
Instead of immediately freeing a batch, stash it in a cache (a small
fixed-size array), for reuse by the same scan.
There's room for improvement:
- Keeping some of the batch pieces (killItems, itemsvisibility, ...)
instead of freeing them in index_batch_release.
- Allocating only space we need (both index_batch_alloc calls use
MaxTIDsPerBTreePage, and thus malloc - because of ALLOC_CHUNK_LIMIT).
---
src/include/access/genam.h | 4 +-
src/include/access/relscan.h | 9 ++
src/backend/access/index/indexam.c | 202 ++++++++++++++++++++++++--
src/backend/access/nbtree/nbtree.c | 16 +-
src/backend/access/nbtree/nbtsearch.c | 4 +-
5 files changed, 211 insertions(+), 24 deletions(-)
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 767503bb6..1a92a8195 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -230,7 +230,9 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
bool recheckOrderBy);
extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
Datum attoptions, bool validate);
-extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan,
+ int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch);
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index db5e3b309..763f4ffe4 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -187,6 +187,8 @@ typedef struct IndexScanBatchData
*/
char *itemsvisibility; /* Index-only scan visibility cache */
+ /* capacity of the batch (size of the items array) */
+ int maxitems;
IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
} IndexScanBatchData;
@@ -266,6 +268,13 @@ typedef struct IndexScanBatchState
int headBatch; /* head batch slot */
int nextBatch; /* next empty batch slot */
+ /* small cache of unused batches, to reduce malloc/free traffic */
+ struct
+ {
+ int maxbatches;
+ IndexScanBatchData **batches;
+ } cache;
+
IndexScanBatchData **batches;
/* callback to skip prefetching in IOS etc. */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index d42cc1c50..f1f2531b3 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -2196,6 +2196,10 @@ index_batch_init(IndexScanDesc scan)
scan->batchState->headBatch = 0; /* initial head batch */
scan->batchState->nextBatch = 0; /* initial batch starts empty */
+ /* XXX init the cache of batches, capacity 16 is arbitrary */
+ scan->batchState->cache.maxbatches = 16;
+ scan->batchState->cache.batches = NULL;
+
scan->batchState->batches =
palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
@@ -2328,34 +2332,123 @@ static void
index_batch_end(IndexScanDesc scan)
{
index_batch_reset(scan, true);
+
+ /* bail out without batching */
+ if (!scan->batchState)
+ return;
+
+ /* we can simply free batches thanks to the earlier reset */
+ if (scan->batchState->batches)
+ pfree(scan->batchState->batches);
+
+ /* also walk the cache of batches, if any */
+ if (scan->batchState->cache.batches)
+ {
+ for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+ {
+ if (scan->batchState->cache.batches[i] == NULL)
+ continue;
+
+ pfree(scan->batchState->cache.batches[i]);
+ }
+
+ pfree(scan->batchState->cache.batches);
+ }
+
+ pfree(scan->batchState);
}
/*
+ * index_batch_alloc
+ * Allocate a batch that can fit maxitems index tuples.
+ *
+ * Returns a IndexScanBatch struct with capacity sufficient for maxitems
+ * index tuples. It's either newly allocated or loaded from a small cache
+ * maintained for individual scans.
+ *
+ * maxitems determines the minimum size of the batch (it may be larger)
+ * want_itup determines whether the bach allocates space for currTuples
+ *
* XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
* which seems unfortunate - it increases the allocation sizes, even if
- * the index would be fine with smaller arrays. This means all batches exceed
- * ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive).
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
*/
IndexScanBatch
-index_batch_alloc(int maxitems, bool want_itup)
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
{
- IndexScanBatch batch = palloc(offsetof(IndexScanBatchData, items) +
- sizeof(IndexScanBatchPosItem) * maxitems);
+ IndexScanBatch batch = NULL;
+ /*
+ * Try to find a sufficiently large batch in the cache.
+ *
+ * Use the first batch that can fit the requested number of items. We
+ * could be smarter and look for the smallest of such batches. But that
+ * probably won't help very much. We expect batches to be mostly uniform,
+ * with about the same size. And index_batch_release() prefers larger
+ * batches, so we should end up with mostly larger batches in the cache.
+ *
+ * XXX We can get here with batchState==NULL for bitmapscans. Could that
+ * mean bitmapscans have issues with malloc/free on batches too? But the
+ * cache can't help with that, when it's in batchState (because bitmap
+ * scans don't have that).
+ */
+ if ((scan->batchState != NULL) &&
+ (scan->batchState->cache.batches != NULL))
+ {
+ /*
+ * try to find a batch in the cache, with maxitems high enough
+ *
+ * XXX Maybe should look for a batch with lowest maxitems? That should
+ * increase probability of cache hits in the future?
+ */
+ for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+ {
+ if ((scan->batchState->cache.batches[i] != NULL) &&
+ (scan->batchState->cache.batches[i]->maxitems >= maxitems))
+ {
+ batch = scan->batchState->cache.batches[i];
+ scan->batchState->cache.batches[i] = NULL;
+ break;
+ }
+ }
+ }
+
+ /* found a batch in the cache? */
+ if (batch)
+ {
+ /* for IOS, we expect to already have the currTuples */
+ Assert(!(want_itup && (batch->currTuples == NULL)));
+
+ /* XXX maybe we could keep these allocations too */
+ Assert(batch->pos == NULL);
+ Assert(batch->itemsvisibility == NULL);
+ }
+ else
+ {
+ batch = palloc(offsetof(IndexScanBatchData, items) +
+ sizeof(IndexScanBatchPosItem) * maxitems);
+
+ batch->maxitems = maxitems;
+
+ /*
+ * If we are doing an index-only scan, we need a tuple storage
+ * workspace. We allocate BLCKSZ for this, which should always give
+ * the index AM enough space to fit a full page's worth of tuples.
+ */
+ batch->currTuples = NULL;
+ if (want_itup)
+ batch->currTuples = palloc(BLCKSZ);
+ }
+
+ /* shared initialization */
batch->firstItem = -1;
batch->lastItem = -1;
batch->killedItems = NULL;
batch->numKilled = 0;
- /*
- * If we are doing an index-only scan, we need a tuple storage workspace.
- * We allocate BLCKSZ for this, which should always give the index AM
- * enough space to fit a full page's worth of tuples.
- */
- batch->currTuples = NULL;
- if (want_itup)
- batch->currTuples = palloc(BLCKSZ);
-
batch->buf = InvalidBuffer;
batch->pos = NULL;
batch->itemsvisibility = NULL; /* per-batch IOS visibility */
@@ -2396,3 +2489,84 @@ index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
ReleaseBuffer(batch->buf);
batch->buf = InvalidBuffer; /* defensive */
}
+
+/*
+ * index_batch_release
+ * Either stash the batch info a small cache for reuse, or free it.
+ */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+ /* custom fields should have been cleaned by amfreebatch */
+ Assert(batch->pos == NULL);
+ Assert(batch->buf == InvalidBuffer);
+
+ /*
+ * free killedItems / itemsvisibility
+ *
+ * XXX We could keep/reuse those too, I guess.
+ */
+
+ if (batch->killedItems != NULL)
+ {
+ pfree(batch->killedItems);
+ batch->killedItems = NULL;
+ }
+
+ if (batch->itemsvisibility != NULL)
+ {
+ pfree(batch->itemsvisibility);
+ batch->itemsvisibility = NULL;
+ }
+
+ /*
+ * Try adding the batch to the small cache - find a slot that's either
+ * empty or used by a smaller batch (with smallest maxitems value), and
+ * replace that batch.
+ *
+ * XXX There may be ways to improve this. We could track the number of
+ * empty slots, and minimum maxitems value, which would allow skipping
+ * pointless searches (in cases when should just discard the batch).
+ */
+ if (scan->batchState != NULL)
+ {
+ /* lowest maxitems we found in the cache (to replace with this batch) */
+ int maxitems = batch->maxitems;
+ int slot = scan->batchState->cache.maxbatches;
+
+ /* first time through, initialize the cache */
+ if (scan->batchState->cache.batches == NULL)
+ scan->batchState->cache.batches
+ = palloc0_array(IndexScanBatch,
+ scan->batchState->cache.maxbatches);
+
+ /* find am empty or sufficiently large batch */
+ for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+ {
+ /* found empty slot, we're done */
+ if (scan->batchState->cache.batches[i] == NULL)
+ {
+ scan->batchState->cache.batches[i] = batch;
+ return;
+ }
+
+ /* found a smaller slot, remember it */
+ if (scan->batchState->cache.batches[i]->maxitems < maxitems)
+ {
+ maxitems = scan->batchState->cache.batches[i]->maxitems;
+ slot = i;
+ }
+ }
+
+ /* found a slot for this batch? */
+ if (maxitems < batch->maxitems)
+ {
+ pfree(scan->batchState->cache.batches[slot]);
+ scan->batchState->cache.batches[slot] = batch;
+ return;
+ }
+ }
+
+ /* either no cache or no slot for this batch */
+ pfree(batch);
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fba562df8..af947b6dc 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -382,21 +382,23 @@ btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
if (batch->numKilled > 0)
_bt_killitems(scan, batch);
- if (batch->itemsvisibility)
- pfree(batch->itemsvisibility);
-
- if (batch->currTuples)
- pfree(batch->currTuples);
-
+ /* free AM-specific fields of the batch */
if (batch->pos)
{
if (!scan->batchState || !scan->batchState->dropPin)
+ {
ReleaseBuffer(batch->buf);
+ batch->buf = InvalidBuffer;
+ }
pfree(batch->pos);
+ batch->pos = NULL;
}
- pfree(batch);
+ /* other fields (itemsvisibility, killItems, currTuples) freed elsewhere */
+
+ /* free the batch (or cache it for reuse) */
+ index_batch_release(scan, batch);
}
/*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 5bb28b4bb..ee947a758 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1200,7 +1200,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* Allocate space for first batch */
- firstbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+ firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
firstbatch->pos = palloc(sizeof(BTScanPosData));
/*
@@ -2237,7 +2237,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
BTScanPos newpos;
/* Allocate space for next batch */
- newbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+ newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
newbatch->pos = palloc(sizeof(BTScanPosData));
newpos = newbatch->pos;
--
2.51.0