v20251109-0003-Reduce-malloc-free-traffic-by-caching-batc.patch

application/octet-stream

Filename: v20251109-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
Type: application/octet-stream
Part: 2
Message: Re: index prefetching

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v20251109-0003
Subject: Reduce malloc/free traffic by caching batches
File+
src/backend/access/index/indexam.c 188 14
src/backend/access/nbtree/nbtree.c 9 7
src/backend/access/nbtree/nbtsearch.c 2 2
src/include/access/genam.h 3 1
src/include/access/relscan.h 9 0
From 6fe6c4b4d5e52c2e476e292d0c1d4e3add3f6b63 Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 10 Sep 2025 16:54:50 -0400
Subject: [PATCH v20251109 3/3] Reduce malloc/free traffic by caching batches

Instead of immediately freeing a batch, stash it in a cache (a small
fixed-size array), for reuse by the same scan.

There's room for improvement:

- Keeping some of the batch pieces (killItems, itemsvisibility, ...)
  instead of freeing them in index_batch_release.

- Allocating only space we need (both index_batch_alloc calls use
  MaxTIDsPerBTreePage, and thus malloc - because of ALLOC_CHUNK_LIMIT).
---
 src/include/access/genam.h            |   4 +-
 src/include/access/relscan.h          |   9 ++
 src/backend/access/index/indexam.c    | 202 ++++++++++++++++++++++++--
 src/backend/access/nbtree/nbtree.c    |  16 +-
 src/backend/access/nbtree/nbtsearch.c |   4 +-
 5 files changed, 211 insertions(+), 24 deletions(-)

diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 767503bb6..1a92a8195 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -230,7 +230,9 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
 												 bool recheckOrderBy);
 extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
 									Datum attoptions, bool validate);
-extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan,
+										int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
 extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch);
 
 
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index db5e3b309..763f4ffe4 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -187,6 +187,8 @@ typedef struct IndexScanBatchData
 	 */
 	char	   *itemsvisibility;	/* Index-only scan visibility cache */
 
+	/* capacity of the batch (size of the items array) */
+	int			maxitems;
 	IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
 } IndexScanBatchData;
 
@@ -266,6 +268,13 @@ typedef struct IndexScanBatchState
 	int			headBatch;		/* head batch slot */
 	int			nextBatch;		/* next empty batch slot */
 
+	/* small cache of unused batches, to reduce malloc/free traffic */
+	struct
+	{
+		int			maxbatches;
+		IndexScanBatchData **batches;
+	}			cache;
+
 	IndexScanBatchData **batches;
 
 	/* callback to skip prefetching in IOS etc. */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index d42cc1c50..f1f2531b3 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -2196,6 +2196,10 @@ index_batch_init(IndexScanDesc scan)
 	scan->batchState->headBatch = 0;	/* initial head batch */
 	scan->batchState->nextBatch = 0;	/* initial batch starts empty */
 
+	/* XXX init the cache of batches, capacity 16 is arbitrary */
+	scan->batchState->cache.maxbatches = 16;
+	scan->batchState->cache.batches = NULL;
+
 	scan->batchState->batches =
 		palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
 
@@ -2328,34 +2332,123 @@ static void
 index_batch_end(IndexScanDesc scan)
 {
 	index_batch_reset(scan, true);
+
+	/* bail out without batching */
+	if (!scan->batchState)
+		return;
+
+	/* we can simply free batches thanks to the earlier reset */
+	if (scan->batchState->batches)
+		pfree(scan->batchState->batches);
+
+	/* also walk the cache of batches, if any */
+	if (scan->batchState->cache.batches)
+	{
+		for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+		{
+			if (scan->batchState->cache.batches[i] == NULL)
+				continue;
+
+			pfree(scan->batchState->cache.batches[i]);
+		}
+
+		pfree(scan->batchState->cache.batches);
+	}
+
+	pfree(scan->batchState);
 }
 
 /*
+ * index_batch_alloc
+ *		Allocate a batch that can fit maxitems index tuples.
+ *
+ * Returns a IndexScanBatch struct with capacity sufficient for maxitems
+ * index tuples. It's either newly allocated or loaded from a small cache
+ * maintained for individual scans.
+ *
+ * maxitems determines the minimum size of the batch (it may be larger)
+ * want_itup determines whether the bach allocates space for currTuples
+ *
  * XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
  * which seems unfortunate - it increases the allocation sizes, even if
- * the index would be fine with smaller arrays. This means all batches exceed
- * ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive).
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
  */
 IndexScanBatch
-index_batch_alloc(int maxitems, bool want_itup)
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
 {
-	IndexScanBatch batch = palloc(offsetof(IndexScanBatchData, items) +
-								  sizeof(IndexScanBatchPosItem) * maxitems);
+	IndexScanBatch batch = NULL;
 
+	/*
+	 * Try to find a sufficiently large batch in the cache.
+	 *
+	 * Use the first batch that can fit the requested number of items. We
+	 * could be smarter and look for the smallest of such batches. But that
+	 * probably won't help very much. We expect batches to be mostly uniform,
+	 * with about the same size. And index_batch_release() prefers larger
+	 * batches, so we should end up with mostly larger batches in the cache.
+	 *
+	 * XXX We can get here with batchState==NULL for bitmapscans. Could that
+	 * mean bitmapscans have issues with malloc/free on batches too? But the
+	 * cache can't help with that, when it's in batchState (because bitmap
+	 * scans don't have that).
+	 */
+	if ((scan->batchState != NULL) &&
+		(scan->batchState->cache.batches != NULL))
+	{
+		/*
+		 * try to find a batch in the cache, with maxitems high enough
+		 *
+		 * XXX Maybe should look for a batch with lowest maxitems? That should
+		 * increase probability of cache hits in the future?
+		 */
+		for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+		{
+			if ((scan->batchState->cache.batches[i] != NULL) &&
+				(scan->batchState->cache.batches[i]->maxitems >= maxitems))
+			{
+				batch = scan->batchState->cache.batches[i];
+				scan->batchState->cache.batches[i] = NULL;
+				break;
+			}
+		}
+	}
+
+	/* found a batch in the cache? */
+	if (batch)
+	{
+		/* for IOS, we expect to already have the currTuples */
+		Assert(!(want_itup && (batch->currTuples == NULL)));
+
+		/* XXX maybe we could keep these allocations too */
+		Assert(batch->pos == NULL);
+		Assert(batch->itemsvisibility == NULL);
+	}
+	else
+	{
+		batch = palloc(offsetof(IndexScanBatchData, items) +
+					   sizeof(IndexScanBatchPosItem) * maxitems);
+
+		batch->maxitems = maxitems;
+
+		/*
+		 * If we are doing an index-only scan, we need a tuple storage
+		 * workspace. We allocate BLCKSZ for this, which should always give
+		 * the index AM enough space to fit a full page's worth of tuples.
+		 */
+		batch->currTuples = NULL;
+		if (want_itup)
+			batch->currTuples = palloc(BLCKSZ);
+	}
+
+	/* shared initialization */
 	batch->firstItem = -1;
 	batch->lastItem = -1;
 	batch->killedItems = NULL;
 	batch->numKilled = 0;
 
-	/*
-	 * If we are doing an index-only scan, we need a tuple storage workspace.
-	 * We allocate BLCKSZ for this, which should always give the index AM
-	 * enough space to fit a full page's worth of tuples.
-	 */
-	batch->currTuples = NULL;
-	if (want_itup)
-		batch->currTuples = palloc(BLCKSZ);
-
 	batch->buf = InvalidBuffer;
 	batch->pos = NULL;
 	batch->itemsvisibility = NULL;	/* per-batch IOS visibility */
@@ -2396,3 +2489,84 @@ index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
 	ReleaseBuffer(batch->buf);
 	batch->buf = InvalidBuffer; /* defensive */
 }
+
+/*
+ * index_batch_release
+ *		Either stash the batch info a small cache for reuse, or free it.
+ */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+	/* custom fields should have been cleaned by amfreebatch */
+	Assert(batch->pos == NULL);
+	Assert(batch->buf == InvalidBuffer);
+
+	/*
+	 * free killedItems / itemsvisibility
+	 *
+	 * XXX We could keep/reuse those too, I guess.
+	 */
+
+	if (batch->killedItems != NULL)
+	{
+		pfree(batch->killedItems);
+		batch->killedItems = NULL;
+	}
+
+	if (batch->itemsvisibility != NULL)
+	{
+		pfree(batch->itemsvisibility);
+		batch->itemsvisibility = NULL;
+	}
+
+	/*
+	 * Try adding the batch to the small cache - find a slot that's either
+	 * empty or used by a smaller batch (with smallest maxitems value), and
+	 * replace that batch.
+	 *
+	 * XXX There may be ways to improve this. We could track the number of
+	 * empty slots, and minimum maxitems value, which would allow skipping
+	 * pointless searches (in cases when should just discard the batch).
+	 */
+	if (scan->batchState != NULL)
+	{
+		/* lowest maxitems we found in the cache (to replace with this batch) */
+		int			maxitems = batch->maxitems;
+		int			slot = scan->batchState->cache.maxbatches;
+
+		/* first time through, initialize the cache */
+		if (scan->batchState->cache.batches == NULL)
+			scan->batchState->cache.batches
+				= palloc0_array(IndexScanBatch,
+								scan->batchState->cache.maxbatches);
+
+		/* find am empty or sufficiently large batch */
+		for (int i = 0; i < scan->batchState->cache.maxbatches; i++)
+		{
+			/* found empty slot, we're done */
+			if (scan->batchState->cache.batches[i] == NULL)
+			{
+				scan->batchState->cache.batches[i] = batch;
+				return;
+			}
+
+			/* found a smaller slot, remember it */
+			if (scan->batchState->cache.batches[i]->maxitems < maxitems)
+			{
+				maxitems = scan->batchState->cache.batches[i]->maxitems;
+				slot = i;
+			}
+		}
+
+		/* found a slot for this batch? */
+		if (maxitems < batch->maxitems)
+		{
+			pfree(scan->batchState->cache.batches[slot]);
+			scan->batchState->cache.batches[slot] = batch;
+			return;
+		}
+	}
+
+	/* either no cache or no slot for this batch */
+	pfree(batch);
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fba562df8..af947b6dc 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -382,21 +382,23 @@ btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
 	if (batch->numKilled > 0)
 		_bt_killitems(scan, batch);
 
-	if (batch->itemsvisibility)
-		pfree(batch->itemsvisibility);
-
-	if (batch->currTuples)
-		pfree(batch->currTuples);
-
+	/* free AM-specific fields of the batch */
 	if (batch->pos)
 	{
 		if (!scan->batchState || !scan->batchState->dropPin)
+		{
 			ReleaseBuffer(batch->buf);
+			batch->buf = InvalidBuffer;
+		}
 
 		pfree(batch->pos);
+		batch->pos = NULL;
 	}
 
-	pfree(batch);
+	/* other fields (itemsvisibility, killItems, currTuples) freed elsewhere */
+
+	/* free the batch (or cache it for reuse) */
+	index_batch_release(scan, batch);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 5bb28b4bb..ee947a758 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1200,7 +1200,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	}
 
 	/* Allocate space for first batch */
-	firstbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+	firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
 	firstbatch->pos = palloc(sizeof(BTScanPosData));
 
 	/*
@@ -2237,7 +2237,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	BTScanPos	newpos;
 
 	/* Allocate space for next batch */
-	newbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+	newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
 	newbatch->pos = palloc(sizeof(BTScanPosData));
 	newpos = newbatch->pos;
 
-- 
2.51.0