v20251102-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
application/octet-stream
Filename: v20251102-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
Type: application/octet-stream
Part: 1
Message:
Re: index prefetching
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v20251102-0003
Subject: Reduce malloc/free traffic by caching batches
| File | + | − |
|---|---|---|
| src/backend/access/index/indexam.c | 159 | 14 |
| src/backend/access/nbtree/nbtree.c | 8 | 7 |
| src/backend/access/nbtree/nbtsearch.c | 2 | 2 |
| src/include/access/genam.h | 2 | 1 |
| src/include/access/relscan.h | 5 | 0 |
From d5f0e9b0ed6f2a2c7284cb72ebadb3373687b5ab Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 10 Sep 2025 16:54:50 -0400
Subject: [PATCH v20251102 3/3] Reduce malloc/free traffic by caching batches
Instead of immediately freeing a batch, stash it in a cache (a small
fixed-size array), for reuse by the same scan.
There's room for improvement:
- Keeping some of the batch pieces (killItems, itemsvisibility, ...)
instead of freeing them in index_batch_release.
- Allocating only space we need (both index_batch_alloc calls use
MaxTIDsPerBTreePage, and thus malloc - because of ALLOC_CHUNK_LIMIT).
---
src/include/access/genam.h | 3 +-
src/include/access/relscan.h | 5 +
src/backend/access/index/indexam.c | 173 +++++++++++++++++++++++---
src/backend/access/nbtree/nbtree.c | 15 +--
src/backend/access/nbtree/nbtsearch.c | 4 +-
5 files changed, 176 insertions(+), 24 deletions(-)
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 767503bb6..913945c4b 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -230,7 +230,8 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
bool recheckOrderBy);
extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
Datum attoptions, bool validate);
-extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch);
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f87c6b31..eb306a9df 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -187,6 +187,7 @@ typedef struct IndexScanBatchData
*/
char *itemsvisibility; /* Index-only scan visibility cache */
+ int maxitems;
IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
} IndexScanBatchData;
@@ -266,6 +267,10 @@ typedef struct IndexScanBatchState
int headBatch; /* head batch slot */
int nextBatch; /* next empty batch slot */
+ /* small cache of unused batches, to reduce malloc/free traffic */
+ int batchesCacheSize;
+ IndexScanBatchData **batchesCache;
+
IndexScanBatchData **batches;
/* callback to skip prefetching in IOS etc. */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index fd3ffa222..0e089001c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -2196,6 +2196,10 @@ index_batch_init(IndexScanDesc scan)
scan->batchState->headBatch = 0; /* initial head batch */
scan->batchState->nextBatch = 0; /* initial batch starts empty */
+ /* XXX init the cache of batches, capacity 16 is arbitrary */
+ scan->batchState->batchesCacheSize = 16;
+ scan->batchState->batchesCache = NULL;
+
scan->batchState->batches =
palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
@@ -2328,34 +2332,102 @@ static void
index_batch_end(IndexScanDesc scan)
{
index_batch_reset(scan, true);
+
+ if (scan->batchState)
+ {
+ if (scan->batchState->batches)
+ pfree(scan->batchState->batches);
+
+ if (scan->batchState->batchesCache)
+ {
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ if (scan->batchState->batchesCache[i] == NULL)
+ continue;
+
+ pfree(scan->batchState->batchesCache[i]);
+ }
+
+ pfree(scan->batchState->batchesCache);
+ }
+ pfree(scan->batchState);
+ }
}
/*
* XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
* which seems unfortunate - it increases the allocation sizes, even if
- * the index would be fine with smaller arrays. This means all batches exceed
- * ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive).
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
*/
IndexScanBatch
-index_batch_alloc(int maxitems, bool want_itup)
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
{
- IndexScanBatch batch = palloc(offsetof(IndexScanBatchData, items) +
- sizeof(IndexScanBatchPosItem) * maxitems);
+ IndexScanBatch batch = NULL;
+ /*
+ * try to find a batch in the cache
+ *
+ * XXX We can get here with batchState==NULL for bitmapscans. Could that
+ * mean bitmapscans have issues with malloc/free on batches too? But the
+ * cache can't help with that, when it's in batchState.
+ */
+ if ((scan->batchState != NULL) &&
+ (scan->batchState->batchesCache != NULL))
+ {
+ /*
+ * try to find a batch in the cache, with maxitems high enough
+ *
+ * XXX Maybe should look for a batch with lowest maxitems? That should
+ * increase probability of cache hits in the future?
+ */
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ if ((scan->batchState->batchesCache[i] != NULL) &&
+ (scan->batchState->batchesCache[i]->maxitems >= maxitems))
+ {
+ batch = scan->batchState->batchesCache[i];
+ scan->batchState->batchesCache[i] = NULL;
+ break;
+ }
+ }
+ }
+
+ /* found a batch in the cache? */
+ if (batch)
+ {
+ /* for IOS, we expect to already have the currTuples */
+ Assert(!(want_itup && (batch->currTuples == NULL)));
+
+ /* XXX maybe we could keep these allocations too */
+ Assert(batch->pos == NULL);
+ Assert(batch->itemsvisibility == NULL);
+ }
+ else
+ {
+ batch = palloc(offsetof(IndexScanBatchData, items) +
+ sizeof(IndexScanBatchPosItem) * maxitems);
+
+ batch->maxitems = maxitems;
+
+ /*
+ * If we are doing an index-only scan, we need a tuple storage
+ * workspace. We allocate BLCKSZ for this, which should always give
+ * the index AM enough space to fit a full page's worth of tuples.
+ */
+ batch->currTuples = NULL;
+ if (want_itup)
+ batch->currTuples = palloc(BLCKSZ);
+ }
+
+ /* shared initialization */
batch->firstItem = -1;
batch->lastItem = -1;
batch->killedItems = NULL;
batch->numKilled = 0;
- /*
- * If we are doing an index-only scan, we need a tuple storage workspace.
- * We allocate BLCKSZ for this, which should always give the index AM
- * enough space to fit a full page's worth of tuples.
- */
- batch->currTuples = NULL;
- if (want_itup)
- batch->currTuples = palloc(BLCKSZ);
-
batch->buf = InvalidBuffer;
batch->pos = NULL;
batch->itemsvisibility = NULL; /* per-batch IOS visibility */
@@ -2396,3 +2468,76 @@ index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
ReleaseBuffer(batch->buf);
batch->buf = InvalidBuffer; /* defensive */
}
+
+/* add the buffer to the cache, or free it */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+ /*
+ * first free some allocations
+ *
+ * XXX We could keep/reuse some of those.
+ */
+
+ if (batch->killedItems != NULL)
+ {
+ pfree(batch->killedItems);
+ batch->killedItems = NULL;
+ }
+
+ if (batch->itemsvisibility != NULL)
+ {
+ pfree(batch->itemsvisibility);
+ batch->itemsvisibility = NULL;
+ }
+
+ /* XXX a bit unclear what's release by AM vs. indexam */
+ Assert(batch->pos == NULL);
+
+ /*
+ * try adding it to the cache - finds a slot that's either empty or has a
+ * lower maxitems value (and replace that batch)
+ *
+ * XXX maybe we should track the number of empty slots, and minimum value
+ * of maxitems, so that we can skip pointless searches?
+ *
+ * XXX ignores cases with batchState=NULL (can we get here with bitmap
+ * scans?)
+ */
+ if (scan->batchState != NULL)
+ {
+ /* lowest maxitems we found in the cache (to replace with batch) */
+ int maxitems = batch->maxitems;
+ int slot = scan->batchState->batchesCacheSize;
+
+ /* first time through, initialize the cache */
+ if (scan->batchState->batchesCache == NULL)
+ scan->batchState->batchesCache
+ = palloc0_array(IndexScanBatch,
+ scan->batchState->batchesCacheSize);
+
+ for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+ {
+ /* found empty slot, we're done */
+ if (scan->batchState->batchesCache[i] == NULL)
+ {
+ scan->batchState->batchesCache[i] = batch;
+ return;
+ }
+
+ /* update lowest maxitems? */
+ if (scan->batchState->batchesCache[i]->maxitems < maxitems)
+ {
+ maxitems = scan->batchState->batchesCache[i]->maxitems;
+ slot = i;
+ }
+ }
+
+ /* found a batch to replace? */
+ if (maxitems < batch->maxitems)
+ {
+ pfree(scan->batchState->batchesCache[slot]);
+ scan->batchState->batchesCache[slot] = batch;
+ }
+ }
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fba562df8..18c734c4a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -382,21 +382,22 @@ btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
if (batch->numKilled > 0)
_bt_killitems(scan, batch);
- if (batch->itemsvisibility)
- pfree(batch->itemsvisibility);
-
- if (batch->currTuples)
- pfree(batch->currTuples);
-
if (batch->pos)
{
if (!scan->batchState || !scan->batchState->dropPin)
ReleaseBuffer(batch->buf);
pfree(batch->pos);
+
+ /* XXX maybe should be done in index_batch_free? */
+ batch->buf = InvalidBuffer;
+ batch->pos = NULL;
}
- pfree(batch);
+ /* XXX keep itemsvisibility, killItems and currTuples */
+
+ /* free the batch (or cache it for reuse) */
+ index_batch_release(scan, batch);
}
/*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 54a767a4f..b019c19f8 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1200,7 +1200,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}
/* Allocate space for first batch */
- firstbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+ firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
firstbatch->pos = palloc(sizeof(BTScanPosData));
/*
@@ -2237,7 +2237,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
BTScanPos newpos;
/* Allocate space for next batch */
- newbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+ newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
newbatch->pos = palloc(sizeof(BTScanPosData));
newpos = newbatch->pos;
--
2.51.0