v3-0003-dshash-add-partition-scoped-sequential-scan.patch

application/x-patch

Filename: v3-0003-dshash-add-partition-scoped-sequential-scan.patch
Type: application/x-patch
Part: 7
Message: Re: Improve pg_stat_statements scalability
From bf08305059d33b65953a424243d452ac90f00786 Mon Sep 17 00:00:00 2001
From: Sami Imseih <samimseih@gmail.com>
Date: Wed, 27 May 2026 10:02:55 -0500
Subject: [PATCH v3 3/5] dshash: add partition-scoped sequential scan

Add dshash_seq_init_partition(), which restricts a sequential scan to
a single partition.  This allows multiple backends to scan a dshash
table in parallel by dividing partitions among them, which a future
commit will use to implement parallel clock-sweep eviction.

Also move the DSHASH_NUM_PARTITIONS macros to dshash.h so that callers
can determine the partition count.

Rename dshash_seq_status.nbuckets to endbucket, since the field
represents the exclusive upper-bound bucket index at which the scan
terminates, not a count.  This is clearer now that partition-scoped
scans set it to the first bucket of the next partition rather than
the total number of buckets.
---
 src/backend/lib/dshash.c | 44 +++++++++++++++++++++++++++++-----------
 src/include/lib/dshash.h | 14 ++++++++++++-
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/src/backend/lib/dshash.c b/src/backend/lib/dshash.c
index 1999989c14f..01e67856f43 100644
--- a/src/backend/lib/dshash.c
+++ b/src/backend/lib/dshash.c
@@ -52,15 +52,6 @@ struct dshash_table_item
 	/* The user's entry object follows here.  See ENTRY_FROM_ITEM(item). */
 };
 
-/*
- * The number of partitions for locking purposes.  This is set to match
- * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
- * the buffer pool must be good enough for any other purpose.  This could
- * become a runtime parameter in future.
- */
-#define DSHASH_NUM_PARTITIONS_LOG2 7
-#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
-
 /* A magic value used to identify our hash tables. */
 #define DSHASH_MAGIC 0x75ff6a20
 
@@ -661,13 +652,42 @@ dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
 {
 	status->hash_table = hash_table;
 	status->curbucket = 0;
-	status->nbuckets = 0;
+	status->endbucket = 0;
 	status->curitem = NULL;
 	status->pnextitem = InvalidDsaPointer;
 	status->curpartition = -1;
 	status->exclusive = exclusive;
 }
 
+/*
+ * Initialize a sequential scan restricted to a single partition.
+ *
+ * Only entries in the specified partition are visited.  The caller must
+ * ensure that 0 <= partition < DSHASH_NUM_PARTITIONS.
+ */
+void
+dshash_seq_init_partition(dshash_seq_status *status, dshash_table *hash_table,
+						  bool exclusive, int partition)
+{
+	Assert(partition >= 0 && partition < DSHASH_NUM_PARTITIONS);
+
+	status->hash_table = hash_table;
+	status->curitem = NULL;
+	status->pnextitem = InvalidDsaPointer;
+	status->exclusive = exclusive;
+
+	LWLockAcquire(PARTITION_LOCK(hash_table, partition),
+				  exclusive ? LW_EXCLUSIVE : LW_SHARED);
+	ensure_valid_bucket_pointers(hash_table);
+
+	status->curpartition = partition;
+	status->endbucket =
+		BUCKET_INDEX_FOR_PARTITION(partition + 1, hash_table->size_log2);
+	/* Set to one before first bucket as the seq scan will ++curbucket */
+	status->curbucket =
+		BUCKET_INDEX_FOR_PARTITION(partition, hash_table->size_log2) - 1;
+}
+
 /*
  * Returns the next element.
  *
@@ -701,7 +721,7 @@ dshash_seq_next(dshash_seq_status *status)
 
 		ensure_valid_bucket_pointers(status->hash_table);
 
-		status->nbuckets =
+		status->endbucket =
 			NUM_BUCKETS(status->hash_table->control->size_log2);
 		next_item_pointer = status->hash_table->buckets[status->curbucket];
 	}
@@ -717,7 +737,7 @@ dshash_seq_next(dshash_seq_status *status)
 	{
 		int			next_partition;
 
-		if (++status->curbucket >= status->nbuckets)
+		if (++status->curbucket >= status->endbucket)
 		{
 			/* all buckets have been scanned. finish. */
 			return NULL;
diff --git a/src/include/lib/dshash.h b/src/include/lib/dshash.h
index 64b758b381b..655b025b996 100644
--- a/src/include/lib/dshash.h
+++ b/src/include/lib/dshash.h
@@ -73,7 +73,7 @@ typedef struct dshash_seq_status
 {
 	dshash_table *hash_table;	/* dshash table working on */
 	int			curbucket;		/* bucket number we are at */
-	int			nbuckets;		/* total number of buckets in the dshash */
+	int			endbucket;		/* first bucket beyond scan range */
 	dshash_table_item *curitem; /* item we are currently at */
 	dsa_pointer pnextitem;		/* dsa-pointer to the next item */
 	int			curpartition;	/* partition number we are at */
@@ -109,9 +109,21 @@ extern void dshash_release_lock(dshash_table *hash_table, void *entry);
 #define dshash_find_or_insert(hash_table, key, found) \
 	dshash_find_or_insert_extended(hash_table, key, found, 0)
 
+/*
+ * The number of partitions for locking purposes.  This is set to match
+ * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
+ * the buffer pool must be good enough for any other purpose.  This could
+ * become a runtime parameter in future.
+ */
+#define DSHASH_NUM_PARTITIONS_LOG2 7
+#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
+
 /* seq scan support */
 extern void dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
 							bool exclusive);
+extern void dshash_seq_init_partition(dshash_seq_status *status,
+									  dshash_table *hash_table,
+									  bool exclusive, int partition);
 extern void *dshash_seq_next(dshash_seq_status *status);
 extern void dshash_seq_term(dshash_seq_status *status);
 extern void dshash_delete_current(dshash_seq_status *status);
-- 
2.50.1 (Apple Git-155)