v20251102-0003-Reduce-malloc-free-traffic-by-caching-batc.patch

application/octet-stream

Filename: v20251102-0003-Reduce-malloc-free-traffic-by-caching-batc.patch
Type: application/octet-stream
Part: 1
Message: Re: index prefetching

Patch

Same data as JSON: GET /api/v1/attachments/:id/patch the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes. API reference →
Format: format-patch
Series: patch v20251102-0003
Subject: Reduce malloc/free traffic by caching batches
File+
src/backend/access/index/indexam.c 159 14
src/backend/access/nbtree/nbtree.c 8 7
src/backend/access/nbtree/nbtsearch.c 2 2
src/include/access/genam.h 2 1
src/include/access/relscan.h 5 0
From d5f0e9b0ed6f2a2c7284cb72ebadb3373687b5ab Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Wed, 10 Sep 2025 16:54:50 -0400
Subject: [PATCH v20251102 3/3] Reduce malloc/free traffic by caching batches

Instead of immediately freeing a batch, stash it in a cache (a small
fixed-size array), for reuse by the same scan.

There's room for improvement:

- Keeping some of the batch pieces (killItems, itemsvisibility, ...)
  instead of freeing them in index_batch_release.

- Allocating only space we need (both index_batch_alloc calls use
  MaxTIDsPerBTreePage, and thus malloc - because of ALLOC_CHUNK_LIMIT).
---
 src/include/access/genam.h            |   3 +-
 src/include/access/relscan.h          |   5 +
 src/backend/access/index/indexam.c    | 173 +++++++++++++++++++++++---
 src/backend/access/nbtree/nbtree.c    |  15 +--
 src/backend/access/nbtree/nbtsearch.c |   4 +-
 5 files changed, 176 insertions(+), 24 deletions(-)

diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 767503bb6..913945c4b 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -230,7 +230,8 @@ extern void index_store_float8_orderby_distances(IndexScanDesc scan,
 												 bool recheckOrderBy);
 extern bytea *index_opclass_options(Relation indrel, AttrNumber attnum,
 									Datum attoptions, bool validate);
-extern IndexScanBatch index_batch_alloc(int maxitems, bool want_itup);
+extern IndexScanBatch index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup);
+extern void index_batch_release(IndexScanDesc scan, IndexScanBatch batch);
 extern void index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch);
 
 
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 6f87c6b31..eb306a9df 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -187,6 +187,7 @@ typedef struct IndexScanBatchData
 	 */
 	char	   *itemsvisibility;	/* Index-only scan visibility cache */
 
+	int			maxitems;
 	IndexScanBatchPosItem items[FLEXIBLE_ARRAY_MEMBER];
 } IndexScanBatchData;
 
@@ -266,6 +267,10 @@ typedef struct IndexScanBatchState
 	int			headBatch;		/* head batch slot */
 	int			nextBatch;		/* next empty batch slot */
 
+	/* small cache of unused batches, to reduce malloc/free traffic */
+	int			batchesCacheSize;
+	IndexScanBatchData **batchesCache;
+
 	IndexScanBatchData **batches;
 
 	/* callback to skip prefetching in IOS etc. */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index fd3ffa222..0e089001c 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -2196,6 +2196,10 @@ index_batch_init(IndexScanDesc scan)
 	scan->batchState->headBatch = 0;	/* initial head batch */
 	scan->batchState->nextBatch = 0;	/* initial batch starts empty */
 
+	/* XXX init the cache of batches, capacity 16 is arbitrary */
+	scan->batchState->batchesCacheSize = 16;
+	scan->batchState->batchesCache = NULL;
+
 	scan->batchState->batches =
 		palloc(sizeof(IndexScanBatchData *) * scan->batchState->maxBatches);
 
@@ -2328,34 +2332,102 @@ static void
 index_batch_end(IndexScanDesc scan)
 {
 	index_batch_reset(scan, true);
+
+	if (scan->batchState)
+	{
+		if (scan->batchState->batches)
+			pfree(scan->batchState->batches);
+
+		if (scan->batchState->batchesCache)
+		{
+			for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+			{
+				if (scan->batchState->batchesCache[i] == NULL)
+					continue;
+
+				pfree(scan->batchState->batchesCache[i]);
+			}
+
+			pfree(scan->batchState->batchesCache);
+		}
+		pfree(scan->batchState);
+	}
 }
 
 /*
  * XXX Both index_batch_alloc() calls in btree use MaxTIDsPerBTreePage,
  * which seems unfortunate - it increases the allocation sizes, even if
- * the index would be fine with smaller arrays. This means all batches exceed
- * ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive).
+ * the index would be fine with smaller arrays. This means all batches
+ * exceed ALLOC_CHUNK_LIMIT, forcing a separate malloc (expensive). The
+ * cache helps for longer queries, not for queries that only create a
+ * single batch, etc.
  */
 IndexScanBatch
-index_batch_alloc(int maxitems, bool want_itup)
+index_batch_alloc(IndexScanDesc scan, int maxitems, bool want_itup)
 {
-	IndexScanBatch batch = palloc(offsetof(IndexScanBatchData, items) +
-								  sizeof(IndexScanBatchPosItem) * maxitems);
+	IndexScanBatch batch = NULL;
 
+	/*
+	 * try to find a batch in the cache
+	 *
+	 * XXX We can get here with batchState==NULL for bitmapscans. Could that
+	 * mean bitmapscans have issues with malloc/free on batches too? But the
+	 * cache can't help with that, when it's in batchState.
+	 */
+	if ((scan->batchState != NULL) &&
+		(scan->batchState->batchesCache != NULL))
+	{
+		/*
+		 * try to find a batch in the cache, with maxitems high enough
+		 *
+		 * XXX Maybe should look for a batch with lowest maxitems? That should
+		 * increase probability of cache hits in the future?
+		 */
+		for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+		{
+			if ((scan->batchState->batchesCache[i] != NULL) &&
+				(scan->batchState->batchesCache[i]->maxitems >= maxitems))
+			{
+				batch = scan->batchState->batchesCache[i];
+				scan->batchState->batchesCache[i] = NULL;
+				break;
+			}
+		}
+	}
+
+	/* found a batch in the cache? */
+	if (batch)
+	{
+		/* for IOS, we expect to already have the currTuples */
+		Assert(!(want_itup && (batch->currTuples == NULL)));
+
+		/* XXX maybe we could keep these allocations too */
+		Assert(batch->pos == NULL);
+		Assert(batch->itemsvisibility == NULL);
+	}
+	else
+	{
+		batch = palloc(offsetof(IndexScanBatchData, items) +
+					   sizeof(IndexScanBatchPosItem) * maxitems);
+
+		batch->maxitems = maxitems;
+
+		/*
+		 * If we are doing an index-only scan, we need a tuple storage
+		 * workspace. We allocate BLCKSZ for this, which should always give
+		 * the index AM enough space to fit a full page's worth of tuples.
+		 */
+		batch->currTuples = NULL;
+		if (want_itup)
+			batch->currTuples = palloc(BLCKSZ);
+	}
+
+	/* shared initialization */
 	batch->firstItem = -1;
 	batch->lastItem = -1;
 	batch->killedItems = NULL;
 	batch->numKilled = 0;
 
-	/*
-	 * If we are doing an index-only scan, we need a tuple storage workspace.
-	 * We allocate BLCKSZ for this, which should always give the index AM
-	 * enough space to fit a full page's worth of tuples.
-	 */
-	batch->currTuples = NULL;
-	if (want_itup)
-		batch->currTuples = palloc(BLCKSZ);
-
 	batch->buf = InvalidBuffer;
 	batch->pos = NULL;
 	batch->itemsvisibility = NULL;	/* per-batch IOS visibility */
@@ -2396,3 +2468,76 @@ index_batch_unlock(Relation rel, bool dropPin, IndexScanBatch batch)
 	ReleaseBuffer(batch->buf);
 	batch->buf = InvalidBuffer; /* defensive */
 }
+
+/* add the buffer to the cache, or free it */
+void
+index_batch_release(IndexScanDesc scan, IndexScanBatch batch)
+{
+	/*
+	 * first free some allocations
+	 *
+	 * XXX We could keep/reuse some of those.
+	 */
+
+	if (batch->killedItems != NULL)
+	{
+		pfree(batch->killedItems);
+		batch->killedItems = NULL;
+	}
+
+	if (batch->itemsvisibility != NULL)
+	{
+		pfree(batch->itemsvisibility);
+		batch->itemsvisibility = NULL;
+	}
+
+	/* XXX a bit unclear what's release by AM vs. indexam */
+	Assert(batch->pos == NULL);
+
+	/*
+	 * try adding it to the cache - finds a slot that's either empty or has a
+	 * lower maxitems value (and replace that batch)
+	 *
+	 * XXX maybe we should track the number of empty slots, and minimum value
+	 * of maxitems, so that we can skip pointless searches?
+	 *
+	 * XXX ignores cases with batchState=NULL (can we get here with bitmap
+	 * scans?)
+	 */
+	if (scan->batchState != NULL)
+	{
+		/* lowest maxitems we found in the cache (to replace with batch) */
+		int			maxitems = batch->maxitems;
+		int			slot = scan->batchState->batchesCacheSize;
+
+		/* first time through, initialize the cache */
+		if (scan->batchState->batchesCache == NULL)
+			scan->batchState->batchesCache
+				= palloc0_array(IndexScanBatch,
+								scan->batchState->batchesCacheSize);
+
+		for (int i = 0; i < scan->batchState->batchesCacheSize; i++)
+		{
+			/* found empty slot, we're done */
+			if (scan->batchState->batchesCache[i] == NULL)
+			{
+				scan->batchState->batchesCache[i] = batch;
+				return;
+			}
+
+			/* update lowest maxitems? */
+			if (scan->batchState->batchesCache[i]->maxitems < maxitems)
+			{
+				maxitems = scan->batchState->batchesCache[i]->maxitems;
+				slot = i;
+			}
+		}
+
+		/* found a batch to replace? */
+		if (maxitems < batch->maxitems)
+		{
+			pfree(scan->batchState->batchesCache[slot]);
+			scan->batchState->batchesCache[slot] = batch;
+		}
+	}
+}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fba562df8..18c734c4a 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -382,21 +382,22 @@ btfreebatch(IndexScanDesc scan, IndexScanBatch batch)
 	if (batch->numKilled > 0)
 		_bt_killitems(scan, batch);
 
-	if (batch->itemsvisibility)
-		pfree(batch->itemsvisibility);
-
-	if (batch->currTuples)
-		pfree(batch->currTuples);
-
 	if (batch->pos)
 	{
 		if (!scan->batchState || !scan->batchState->dropPin)
 			ReleaseBuffer(batch->buf);
 
 		pfree(batch->pos);
+
+		/* XXX maybe should be done in index_batch_free? */
+		batch->buf = InvalidBuffer;
+		batch->pos = NULL;
 	}
 
-	pfree(batch);
+	/* XXX keep itemsvisibility, killItems and currTuples */
+
+	/* free the batch (or cache it for reuse) */
+	index_batch_release(scan, batch);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 54a767a4f..b019c19f8 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1200,7 +1200,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	}
 
 	/* Allocate space for first batch */
-	firstbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+	firstbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
 	firstbatch->pos = palloc(sizeof(BTScanPosData));
 
 	/*
@@ -2237,7 +2237,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	BTScanPos	newpos;
 
 	/* Allocate space for next batch */
-	newbatch = index_batch_alloc(MaxTIDsPerBTreePage, scan->xs_want_itup);
+	newbatch = index_batch_alloc(scan, MaxTIDsPerBTreePage, scan->xs_want_itup);
 	newbatch->pos = palloc(sizeof(BTScanPosData));
 	newpos = newbatch->pos;
 
-- 
2.51.0