v10f-0002-Lower-pg_stat_io_histogram-private-backend-memo.patch
text/x-patch
Filename: v10f-0002-Lower-pg_stat_io_histogram-private-backend-memo.patch
Type: text/x-patch
Part: 0
Message:
Re: pg_stat_io_histogram
From d6783896069de828b13c554c4c21ce439a76d2bc Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Wed, 18 Mar 2026 07:24:14 +0100
Subject: [PATCH v10f 2/3] Lower pg_stat_io_histogram private (backend) memory
in pending_hist_time_buckets by using array with indirect offsets.
---
src/backend/utils/activity/pgstat.c | 9 +--
src/backend/utils/activity/pgstat_io.c | 90 ++++++++++++++++++++++++--
src/include/pgstat.h | 19 ++++--
src/include/utils/pgstat_internal.h | 1 +
4 files changed, 102 insertions(+), 17 deletions(-)
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 9feb2f1370b..7c597932671 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -445,6 +445,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE]
.shared_data_off = offsetof(PgStatShared_IO, stats),
.shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats),
+ .init_backend_cb = pgstat_io_init_backend_cb,
.flush_static_cb = pgstat_io_flush_cb,
.init_shmem_cb = pgstat_io_init_shmem_cb,
.reset_all_cb = pgstat_io_reset_all_cb,
@@ -691,14 +692,6 @@ pgstat_initialize(void)
/* Set up a process-exit hook to clean up */
before_shmem_exit(pgstat_shutdown_hook, 0);
- /* Allocate I/O latency buckets only if we are going to populate it */
- if (track_io_timing || track_wal_io_timing)
- PendingIOStats.pending_hist_time_buckets = MemoryContextAllocZero(TopMemoryContext,
- IOOBJECT_NUM_TYPES * IOCONTEXT_NUM_TYPES * IOOP_NUM_TYPES *
- PGSTAT_IO_HIST_BUCKETS * sizeof(uint64));
- else
- PendingIOStats.pending_hist_time_buckets = NULL;
-
#ifdef USE_ASSERT_CHECKING
pgstat_is_initialized = true;
#endif
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index bbf910ac4bb..1696f278a77 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -16,6 +16,7 @@
#include "postgres.h"
+#include "access/xlog.h"
#include "executor/instrument.h"
#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
@@ -66,6 +67,27 @@ pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
return true;
}
+/*
+ * Return the number of (io_object, io_context, io_op) combinations for which
+ * pgstat_tracks_io_op() reports tracking for the given backend type.
+ */
+int
+pgstat_bktype_count_potentially_used(BackendType bktype)
+{
+	int			n_tracked = 0;
+
+	for (int obj = 0; obj < IOOBJECT_NUM_TYPES; obj++)
+	{
+		for (int ctx = 0; ctx < IOCONTEXT_NUM_TYPES; ctx++)
+		{
+			for (int op = 0; op < IOOP_NUM_TYPES; op++)
+				n_tracked += pgstat_tracks_io_op(bktype, obj, ctx, op) ? 1 : 0;
+		}
+	}
+
+	return n_tracked;
+}
+
void
pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
uint32 cnt, uint64 bytes)
@@ -186,12 +208,16 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
if (PendingIOStats.pending_hist_time_buckets != NULL)
{
+ int offset;
+
/*
* calculate the bucket_index based on latency in nanoseconds
* (uint64)
*/
bucket_index = get_bucket_index(INSTR_TIME_GET_NANOSEC(io_time));
- PendingIOStats.pending_hist_time_buckets[io_object][io_context][io_op][bucket_index]++;
+
+ offset = PendingIOStats.pending_hist_time_buckets_offsets[io_object][io_context][io_op];
+ PendingIOStats.pending_hist_time_buckets[offset][bucket_index]++;
}
/* Add the per-backend count */
@@ -264,10 +290,23 @@ pgstat_io_flush_cb(bool nowait)
bktype_shstats->times[io_object][io_context][io_op] +=
INSTR_TIME_GET_MICROSEC(time);
+ /*
+ * If latency histograms are being collected, add this
+ * backend's pending buckets to the ones in shared memory.
+ * The pending buckets are stored compactly to save memory,
+ * so look up their slot via pending_hist_time_buckets_offsets.
+ */
if (PendingIOStats.pending_hist_time_buckets != NULL)
for (int b = 0; b < PGSTAT_IO_HIST_BUCKETS; b++)
- bktype_shstats->hist_time_buckets[io_object][io_context][io_op][b] +=
- PendingIOStats.pending_hist_time_buckets[io_object][io_context][io_op][b];
+ {
+ int pending_off = PendingIOStats.pending_hist_time_buckets_offsets[io_object][io_context][io_op];
+
+ if (pending_off != -1)
+ {
+ bktype_shstats->hist_time_buckets[io_object][io_context][io_op][b] +=
+ PendingIOStats.pending_hist_time_buckets[pending_off][b];
+ }
+ }
}
}
}
@@ -276,8 +315,14 @@ pgstat_io_flush_cb(bool nowait)
LWLockRelease(bktype_lock);
- /* Avoid overwriting latency buckets array pointer */
+ /*
+ * Reset only the counters that precede pending_hist_time_buckets, so the
+ * histogram pointer, offsets and size survive; the buckets are zeroed below.
+ */
memset(&PendingIOStats, 0, offsetof(PgStat_PendingIO, pending_hist_time_buckets));
+ if (PendingIOStats.pending_hist_time_buckets != NULL)
+ memset(PendingIOStats.pending_hist_time_buckets, 0,
+ PendingIOStats.pending_hist_time_buckets_size * sizeof(*PendingIOStats.pending_hist_time_buckets));
have_iostats = false;
@@ -349,6 +394,43 @@ pgstat_get_io_op_name(IOOp io_op)
pg_unreachable();
}
+/*
+ * Per-backend initialization of the pending I/O latency histograms.
+ *
+ * If neither track_io_timing nor track_wal_io_timing is set, nothing is
+ * allocated and pending_hist_time_buckets stays NULL.  Otherwise one zeroed
+ * histogram is allocated in TopMemoryContext for every
+ * (io_object, io_context, io_op) combination that pgstat_tracks_io_op()
+ * reports for MyBackendType, and pending_hist_time_buckets_offsets maps each
+ * combination to its histogram slot, or to -1 when it is not tracked.
+ *
+ * NOTE(review): the offsets table is declared as uint64 in PgStat_PendingIO,
+ * so the -1 marker is stored as an unsigned value and only compares equal to
+ * -1 after conversion back to int; a signed element type looks intended.
+ */
+void
+pgstat_io_init_backend_cb(void)
+{
+	int			n_hist;
+	int			next_slot = 0;
+	size_t		alloc_sz;
+
+	if (!track_io_timing && !track_wal_io_timing)
+	{
+		/* The histograms would never be populated, so do not allocate them */
+		PendingIOStats.pending_hist_time_buckets = NULL;
+		PendingIOStats.pending_hist_time_buckets_size = 0;
+		return;
+	}
+
+	n_hist = pgstat_bktype_count_potentially_used(MyBackendType);
+	alloc_sz = n_hist * sizeof(*PendingIOStats.pending_hist_time_buckets);
+
+	PendingIOStats.pending_hist_time_buckets_size = n_hist;
+	PendingIOStats.pending_hist_time_buckets =
+		MemoryContextAllocZero(TopMemoryContext, alloc_sz);
+
+	for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
+	{
+		for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
+		{
+			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
+			{
+				if (pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op))
+				{
+					/*
+					 * next_slot is about to be used as an array index, so it
+					 * must be strictly below the number of histograms.
+					 */
+					Assert(next_slot < n_hist);
+
+					PendingIOStats.pending_hist_time_buckets_offsets[io_object][io_context][io_op] =
+						next_slot++;
+				}
+				else
+					PendingIOStats.pending_hist_time_buckets_offsets[io_object][io_context][io_op] = -1;
+			}
+		}
+	}
+}
+
void
pgstat_io_init_shmem_cb(void *stats)
{
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 34fd93f86dc..984914e69b8 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -352,12 +352,20 @@ typedef struct PgStat_PendingIO
instr_time pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
/*
- * Dynamically allocated array of
- * [IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES]
- * [IOOP_NUM_TYPES][PGSTAT_IO_HIST_BUCKETS] only with track_io_timings
- * true.
+ * Array of histograms for pg_stat_io_histogram, allocated only when
+ * track_io_timing or track_wal_io_timing is set. To save memory it is
+ * indexed through pending_hist_time_buckets_offsets, which holds each
+ * combination's slot in pending_hist_time_buckets, or -1 if untracked.
*/
- uint64 (*pending_hist_time_buckets)[IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES][PGSTAT_IO_HIST_BUCKETS];
+ uint64 (*pending_hist_time_buckets)[PGSTAT_IO_HIST_BUCKETS];
+ uint64 pending_hist_time_buckets_offsets[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+
+ /*
+ * Cache how many histograms we have allocated to avoid repeatedly calling
+ * pgstat_bktype_count_potentially_used(MyBackendType) from
+ * pgstat_io_flush_cb()
+ */
+ int pending_hist_time_buckets_size;
} PgStat_PendingIO;
extern PgStat_PendingIO PendingIOStats;
@@ -645,6 +653,7 @@ extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io,
BackendType bktype);
+extern int pgstat_bktype_count_potentially_used(BackendType bktype);
extern void pgstat_count_io_op(IOObject io_object, IOContext io_context,
IOOp io_op, uint32 cnt, uint64 bytes);
extern instr_time pgstat_prepare_io_time(bool track_io_guc);
diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index a3ce8b04723..fcaf21db574 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -759,6 +759,7 @@ extern void pgstat_function_reset_timestamp_cb(PgStatShared_Common *header, Time
extern void pgstat_flush_io(bool nowait);
extern bool pgstat_io_flush_cb(bool nowait);
+extern void pgstat_io_init_backend_cb(void);
extern void pgstat_io_init_shmem_cb(void *stats);
extern void pgstat_io_reset_all_cb(TimestampTz ts);
extern void pgstat_io_snapshot_cb(void);
--
2.43.0