v10f-0001-Add-pg_stat_io_histogram-view-to-provide-more-d.patch
text/x-patch
Filename: v10f-0001-Add-pg_stat_io_histogram-view-to-provide-more-d.patch
Type: text/x-patch
Part: 1
Message:
Re: pg_stat_io_histogram
From de74801e8e784f5126adf7029a2d3e112c985fde Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 23 Jan 2026 08:10:09 +0100
Subject: [PATCH v10f 1/3] Add pg_stat_io_histogram view to provide more
detailed insight into IO profile
pg_stat_io_histogram displays a histogram of IO latencies for specific
backend_type, object, context and io_type. The histogram has buckets that allow
faster identification of I/O latency outliers due to faulty hardware and/or
misbehaving I/O stack. Such I/O outliers e.g. slow fsyncs could sometimes
cause intermittent issues e.g. for COMMIT or affect the synchronous standbys
performance.
Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Ants Aasma <ants.aasma@cybertec.at>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAKZiRmwvE4uJLKTgPXeBA4m%2Bd4tTghayoefcaM9%3Dz3_S7i72GA%40mail.gmail.com
---
configure | 38 +++
configure.ac | 1 +
doc/src/sgml/config.sgml | 12 +-
doc/src/sgml/monitoring.sgml | 290 ++++++++++++++++++++
doc/src/sgml/wal.sgml | 5 +-
meson.build | 1 +
src/backend/catalog/system_views.sql | 11 +
src/backend/utils/activity/pgstat.c | 19 +-
src/backend/utils/activity/pgstat_backend.c | 4 +-
src/backend/utils/activity/pgstat_io.c | 92 ++++++-
src/backend/utils/adt/pgstatfuncs.c | 148 ++++++++++
src/include/catalog/pg_proc.dat | 9 +
src/include/pgstat.h | 38 ++-
src/include/port/pg_bitutils.h | 38 ++-
src/include/utils/pgstat_internal.h | 2 +-
src/test/recovery/t/029_stats_restart.pl | 29 ++
src/test/regress/expected/rules.out | 8 +
src/tools/pgindent/typedefs.list | 1 +
18 files changed, 727 insertions(+), 19 deletions(-)
diff --git a/configure b/configure
index f66c1054a7a..c09329240be 100755
--- a/configure
+++ b/configure
@@ -16054,6 +16054,44 @@ cat >>confdefs.h <<_ACEOF
#define HAVE__BUILTIN_CLZ 1
_ACEOF
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_clzl" >&5
+$as_echo_n "checking for __builtin_clzl... " >&6; }
+if ${pgac_cv__builtin_clzl+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+call__builtin_clzl(unsigned long x)
+{
+ return __builtin_clzl(x);
+}
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv__builtin_clzl=yes
+else
+ pgac_cv__builtin_clzl=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_clzl" >&5
+$as_echo "$pgac_cv__builtin_clzl" >&6; }
+if test x"${pgac_cv__builtin_clzl}" = xyes ; then
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE__BUILTIN_CLZL 1
+_ACEOF
+
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_ctz" >&5
$as_echo_n "checking for __builtin_ctz... " >&6; }
diff --git a/configure.ac b/configure.ac
index 8d176bd3468..8f804464bc5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1881,6 +1881,7 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap32], [int x])
PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x])
# We assume that we needn't test all widths of these explicitly:
PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x])
+PGAC_CHECK_BUILTIN_FUNC([__builtin_clzl], [unsigned long x])
PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x])
# __builtin_frame_address may draw a diagnostic for non-constant argument,
# so it needs a different test function.
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 73cc0412330..91b1fd7e635 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9067,9 +9067,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
displayed in <link linkend="monitoring-pg-stat-database-view">
<structname>pg_stat_database</structname></link>,
<link linkend="monitoring-pg-stat-io-view">
- <structname>pg_stat_io</structname></link> (if <varname>object</varname>
- is not <literal>wal</literal>), in the output of the
- <link linkend="pg-stat-get-backend-io">
+ <structname>pg_stat_io</structname></link> and
+ <link linkend="monitoring-pg-stat-io-histogram-view">
+ <structname>pg_stat_io_histogram</structname></link>
+ (if <varname>object</varname> is not <literal>wal</literal>),
+ in the output of the <link linkend="pg-stat-get-backend-io">
<function>pg_stat_get_backend_io()</function></link> function (if
<varname>object</varname> is not <literal>wal</literal>), in the
output of <xref linkend="sql-explain"/> when the <literal>BUFFERS</literal>
@@ -9099,7 +9101,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
measure the overhead of timing on your system.
I/O timing information is displayed in
<link linkend="monitoring-pg-stat-io-view">
- <structname>pg_stat_io</structname></link> for the
+ <structname>pg_stat_io</structname></link> and
+ <link linkend="monitoring-pg-stat-io-histogram-view">
+ <structname>pg_stat_io_histogram</structname></link> for the
<varname>object</varname> <literal>wal</literal> and in the output of
the <link linkend="pg-stat-get-backend-io">
<function>pg_stat_get_backend_io()</function></link> function for the
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 08d5b824552..e8c5f391841 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -509,6 +509,17 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
</entry>
</row>
+ <row>
+ <entry><structname>pg_stat_io_histogram</structname><indexterm><primary>pg_stat_io_histogram</primary></indexterm></entry>
+ <entry>
+ One row for each combination of backend type, context, target object,
+ IO operation type and latency bucket (in microseconds) containing
+ cluster-wide I/O statistics.
+ See <link linkend="monitoring-pg-stat-io-histogram-view">
+ <structname>pg_stat_io_histogram</structname></link> for details.
+ </entry>
+ </row>
+
<row>
<entry><structname>pg_stat_lock</structname><indexterm><primary>pg_stat_lock</primary></indexterm></entry>
<entry>
@@ -734,6 +745,8 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
Users are advised to use the <productname>PostgreSQL</productname>
statistics views in combination with operating system utilities for a more
complete picture of their database's I/O performance.
+ Furthermore the <structname>pg_stat_io_histogram</structname> view can be helpful
+ identifying latency outliers for specific I/O operations.
</para>
</sect2>
@@ -3302,6 +3315,283 @@ description | Waiting for a newly initialized WAL file to reach durable storage
</sect2>
+ <sect2 id="monitoring-pg-stat-io-histogram-view">
+ <title><structname>pg_stat_io_histogram</structname></title>
+
+ <indexterm>
+ <primary>pg_stat_io_histogram</primary>
+ </indexterm>
+
+ <para>
+ The <structname>pg_stat_io_histogram</structname> view will contain one row for each
+ combination of backend type, target I/O object, and I/O context, IO operation
+ type, bucket latency cluster-wide I/O statistics. Combinations which do not make sense
+ are omitted.
+ </para>
+
+ <para>
+ The view shows measured perceived I/O latency by the backend, not the kernel or device
+ one. This is important distinction when troubleshooting, as the I/O latency observed by
+ the backend might get affected by:
+ <itemizedlist>
+ <listitem>
+ <para>OS scheduler decisions and available CPU resources.</para>
+ <para>With AIO, it might include time to service other IOs from the queue. That will often inflate IO latency.</para>
+ <para>In case of writing, additional filesystem journaling operations.</para>
+ </listitem>
+ </itemizedlist>
+ </para>
+
+ <para>
+ Currently, I/O on relations (e.g. tables, indexes) and WAL activity are
+ tracked. However, relation I/O which bypasses shared buffers
+ (e.g. when moving a table from one tablespace to another) is currently
+ not tracked.
+ </para>
+
+ <table id="pg-stat-io-histogram-view" xreflabel="pg_stat_io_histogram">
+ <title><structname>pg_stat_io_histogram</structname> View</title>
+ <tgroup cols="1">
+ <thead>
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ Column Type
+ </para>
+ <para>
+ Description
+ </para>
+ </entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>backend_type</structfield> <type>text</type>
+ </para>
+ <para>
+ Type of backend (e.g. background worker, autovacuum worker). See <link
+ linkend="monitoring-pg-stat-activity-view">
+ <structname>pg_stat_activity</structname></link> for more information
+ on <varname>backend_type</varname>s. Some
+ <varname>backend_type</varname>s do not accumulate I/O operation
+ statistics and will not be included in the view.
+ </para>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>object</structfield> <type>text</type>
+ </para>
+ <para>
+ Target object of an I/O operation. Possible values are:
+ <itemizedlist>
+ <listitem>
+ <para>
+ <literal>relation</literal>: Permanent relations.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>temp relation</literal>: Temporary relations.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>wal</literal>: Write Ahead Logs.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>context</structfield> <type>text</type>
+ </para>
+ <para>
+ The context of an I/O operation. Possible values are:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <literal>normal</literal>: The default or standard
+ <varname>context</varname> for a type of I/O operation. For
+ example, by default, relation data is read into and written out from
+ shared buffers. Thus, reads and writes of relation data to and from
+ shared buffers are tracked in <varname>context</varname>
+ <literal>normal</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>init</literal>: I/O operations performed while creating the
+ WAL segments are tracked in <varname>context</varname>
+ <literal>init</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>vacuum</literal>: I/O operations performed outside of shared
+ buffers while vacuuming and analyzing permanent relations. Temporary
+ table vacuums use the same local buffer pool as other temporary table
+ I/O operations and are tracked in <varname>context</varname>
+ <literal>normal</literal>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>bulkread</literal>: Certain large read I/O operations
+ done outside of shared buffers, for example, a sequential scan of a
+ large table.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>bulkwrite</literal>: Certain large write I/O operations
+ done outside of shared buffers, such as <command>COPY</command>.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>io_type</structfield> <type>text</type>
+ </para>
+ <para>
+ The type of I/O operation. Possible values are:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <literal>evict</literal>: eviction from shared buffers cache.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>fsync</literal>: synchronization of modified kernel's
+ filesystem page cache with storage device.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>hit</literal>: shared buffers cache lookup hit.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>reuse</literal>: reuse of existing buffer in case of
+ reusing limited-space ring buffer (applies to <literal>bulkread</literal>,
+ <literal>bulkwrite</literal>, or <literal>vacuum</literal> contexts).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>writeback</literal>: advise kernel that the described dirty
+ data should be flushed to disk preferably asynchronously.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>extend</literal>: add new zeroed blocks to the end of file.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>read</literal>: self explanatory.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <literal>write</literal>: self explanatory.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>bucket_latency_us</structfield> <type>int4range</type>
+ </para>
+ <para>
+ The latency bucket (in microseconds).
+ </para>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>bucket_count</structfield> <type>bigint</type>
+ </para>
+ <para>
+ Number of times latency of the I/O operation hit this specific bucket (with
+ up to <varname>bucket_latency_us</varname> microseconds).
+ </para>
+ </entry>
+ </row>
+
+ <row>
+ <entry role="catalog_table_entry">
+ <para role="column_definition">
+ <structfield>stats_reset</structfield> <type>timestamp with time zone</type>
+ </para>
+ <para>
+ Time at which these statistics were last reset.
+ </para>
+ </entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ Some backend types never perform I/O operations on some I/O objects and/or
+ in some I/O contexts. These rows might display zero bucket counts for such
+ specific operations.
+ </para>
+
+ <para>
+ <structname>pg_stat_io_histogram</structname> can be used to identify
+ I/O storage issues
+ For example:
+ <itemizedlist>
+ <listitem>
+ <para>
+ Presence of abnormally high latency for <varname>fsyncs</varname> might
+ indicate I/O saturation, oversubscription or hardware connectivity issues.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Unusually high latency for <varname>fsyncs</varname> on standby's startup
+ backend type, might be responsible for high duration of commits in
+ synchronous replication setups.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </para>
+
+ <note>
+ <para>
+ Columns tracking I/O wait time will only be non-zero when
+ <xref linkend="guc-track-io-timing"/> is enabled. The user should be
+ careful when referencing these columns in combination with their
+ corresponding I/O operations in case <varname>track_io_timing</varname>
+ was not enabled for the entire time since the last stats reset.
+ </para>
+ </note>
+ </sect2>
<sect2 id="monitoring-pg-stat-lock-view">
<title><structname>pg_stat_lock</structname></title>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index c32931edde3..531245935da 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -950,8 +950,9 @@
of times <function>XLogWrite</function> writes and
<function>issue_xlog_fsync</function> syncs WAL data to disk are also
counted as <varname>writes</varname> and <varname>fsyncs</varname>
- in <structname>pg_stat_io</structname> for the <varname>object</varname>
- <literal>wal</literal>, respectively.
+ in <structname>pg_stat_io</structname> and
+ <structname>pg_stat_io_histogram</structname> for the
+ <varname>object</varname> <literal>wal</literal>, respectively.
</para>
<para>
diff --git a/meson.build b/meson.build
index 20b887f1a1b..51058165742 100644
--- a/meson.build
+++ b/meson.build
@@ -2048,6 +2048,7 @@ builtins = [
'bswap32',
'bswap64',
'clz',
+ 'clzl',
'ctz',
'constant_p',
'frame_address',
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 73a1c1c4670..a752ab157ba 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1282,6 +1282,17 @@ SELECT
b.stats_reset
FROM pg_stat_get_io() b;
+CREATE VIEW pg_stat_io_histogram AS
+SELECT
+ b.backend_type,
+ b.object,
+ b.context,
+ b.io_type,
+ b.bucket_latency_us,
+ b.bucket_count,
+ b.stats_reset
+FROM pg_stat_get_io_histogram() b;
+
CREATE VIEW pg_stat_wal AS
SELECT
w.wal_records,
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index b67da88c7dc..9feb2f1370b 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -105,8 +105,10 @@
#include <unistd.h>
#include "access/xact.h"
+#include "access/xlog.h"
#include "lib/dshash.h"
#include "pgstat.h"
+#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
@@ -689,6 +691,14 @@ pgstat_initialize(void)
/* Set up a process-exit hook to clean up */
before_shmem_exit(pgstat_shutdown_hook, 0);
+ /* Allocate I/O latency buckets only if we are going to populate it */
+ if (track_io_timing || track_wal_io_timing)
+ PendingIOStats.pending_hist_time_buckets = MemoryContextAllocZero(TopMemoryContext,
+ IOOBJECT_NUM_TYPES * IOCONTEXT_NUM_TYPES * IOOP_NUM_TYPES *
+ PGSTAT_IO_HIST_BUCKETS * sizeof(uint64));
+ else
+ PendingIOStats.pending_hist_time_buckets = NULL;
+
#ifdef USE_ASSERT_CHECKING
pgstat_is_initialized = true;
#endif
@@ -1668,10 +1678,17 @@ pgstat_write_statsfile(void)
pgstat_build_snapshot_fixed(kind);
if (pgstat_is_kind_builtin(kind))
- ptr = ((char *) &pgStatLocal.snapshot) + info->snapshot_ctl_off;
+ {
+ if (kind == PGSTAT_KIND_IO)
+ ptr = (char *) pgStatLocal.snapshot.io;
+ else
+ ptr = ((char *) &pgStatLocal.snapshot) + info->snapshot_ctl_off;
+ }
else
ptr = pgStatLocal.snapshot.custom_data[kind - PGSTAT_KIND_CUSTOM_MIN];
+ Assert(ptr != NULL);
+
fputc(PGSTAT_FILE_ENTRY_FIXED, fpout);
pgstat_write_chunk_s(fpout, &kind);
pgstat_write_chunk(fpout, ptr, info->shared_data_len);
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 73461c9bca5..fc1bf824a31 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -168,7 +168,7 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
{
PgStatShared_Backend *shbackendent;
PgStat_BktypeIO *bktype_shstats;
- PgStat_PendingIO pending_io;
+ PgStat_BackendPendingIO pending_io;
/*
* This function can be called even if nothing at all has happened for IO
@@ -205,7 +205,7 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
/*
* Clear out the statistics buffer, so it can be re-used.
*/
- MemSet(&PendingBackendStats.pending_io, 0, sizeof(PgStat_PendingIO));
+ MemSet(&PendingBackendStats.pending_io, 0, sizeof(PgStat_BackendPendingIO));
backend_has_iostats = false;
}
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 2be26e92283..bbf910ac4bb 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -17,10 +17,12 @@
#include "postgres.h"
#include "executor/instrument.h"
+#include "port/pg_bitutils.h"
#include "storage/bufmgr.h"
+#include "utils/memutils.h"
#include "utils/pgstat_internal.h"
-static PgStat_PendingIO PendingIOStats;
+PgStat_PendingIO PendingIOStats;
static bool have_iostats = false;
/*
@@ -107,6 +109,35 @@ pgstat_prepare_io_time(bool track_io_guc)
return io_start;
}
+#define MIN_PG_STAT_IO_HIST_LATENCY 8191
+static inline int
+get_bucket_index(uint64_t ns)
+{
+ const uint32_t max_index = PGSTAT_IO_HIST_BUCKETS - 1;
+
+ /*
+ * hopefully pre-calculated by the compiler: clzl(8191) =
+ * clz(01111111111111b on uint64)
+ */
+ const uint32_t min_latency_leading_zeros =
+ pg_leading_zero_bits64(MIN_PG_STAT_IO_HIST_LATENCY);
+
+ /*
+ * make sure the tmp value has at least 8191 (our minimum bucket size) as
+ * __builtin_clzl might return undefined behavior when operating on 0
+ */
+ uint64_t tmp = ns | MIN_PG_STAT_IO_HIST_LATENCY;
+
+ /* count leading zeros */
+ int leading_zeros = pg_leading_zero_bits64(tmp);
+
+ /* normalize the index */
+ uint32_t index = min_latency_leading_zeros - leading_zeros;
+
+ /* clamp it to the maximum */
+ return (index > max_index) ? max_index : index;
+}
+
/*
* Like pgstat_count_io_op() except it also accumulates time.
*
@@ -125,6 +156,7 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
if (!INSTR_TIME_IS_ZERO(start_time))
{
instr_time io_time;
+ int bucket_index;
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, start_time);
@@ -152,6 +184,16 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
io_time);
+ if (PendingIOStats.pending_hist_time_buckets != NULL)
+ {
+ /*
+ * calculate the bucket_index based on latency in nanoseconds
+ * (uint64)
+ */
+ bucket_index = get_bucket_index(INSTR_TIME_GET_NANOSEC(io_time));
+ PendingIOStats.pending_hist_time_buckets[io_object][io_context][io_op][bucket_index]++;
+ }
+
/* Add the per-backend count */
pgstat_count_backend_io_op_time(io_object, io_context, io_op,
io_time);
@@ -165,7 +207,7 @@ pgstat_fetch_stat_io(void)
{
pgstat_snapshot_fixed(PGSTAT_KIND_IO);
- return &pgStatLocal.snapshot.io;
+ return pgStatLocal.snapshot.io;
}
/*
@@ -221,6 +263,11 @@ pgstat_io_flush_cb(bool nowait)
bktype_shstats->times[io_object][io_context][io_op] +=
INSTR_TIME_GET_MICROSEC(time);
+
+ if (PendingIOStats.pending_hist_time_buckets != NULL)
+ for (int b = 0; b < PGSTAT_IO_HIST_BUCKETS; b++)
+ bktype_shstats->hist_time_buckets[io_object][io_context][io_op][b] +=
+ PendingIOStats.pending_hist_time_buckets[io_object][io_context][io_op][b];
}
}
}
@@ -229,7 +276,8 @@ pgstat_io_flush_cb(bool nowait)
LWLockRelease(bktype_lock);
- memset(&PendingIOStats, 0, sizeof(PendingIOStats));
+ /* Avoid overwriting latency buckets array pointer */
+ memset(&PendingIOStats, 0, offsetof(PgStat_PendingIO, pending_hist_time_buckets));
have_iostats = false;
@@ -274,6 +322,33 @@ pgstat_get_io_object_name(IOObject io_object)
pg_unreachable();
}
+const char *
+pgstat_get_io_op_name(IOOp io_op)
+{
+ switch (io_op)
+ {
+ case IOOP_EVICT:
+ return "evict";
+ case IOOP_FSYNC:
+ return "fsync";
+ case IOOP_HIT:
+ return "hit";
+ case IOOP_REUSE:
+ return "reuse";
+ case IOOP_WRITEBACK:
+ return "writeback";
+ case IOOP_EXTEND:
+ return "extend";
+ case IOOP_READ:
+ return "read";
+ case IOOP_WRITE:
+ return "write";
+ }
+
+ elog(ERROR, "unrecognized IOOp value: %d", io_op);
+ pg_unreachable();
+}
+
void
pgstat_io_init_shmem_cb(void *stats)
{
@@ -281,6 +356,9 @@ pgstat_io_init_shmem_cb(void *stats)
for (int i = 0; i < BACKEND_NUM_TYPES; i++)
LWLockInitialize(&stat_shmem->locks[i], LWTRANCHE_PGSTATS_DATA);
+
+ /* this might end up being lazily allocated in pgstat_io_snapshot_cb() */
+ pgStatLocal.snapshot.io = NULL;
}
void
@@ -308,11 +386,15 @@ pgstat_io_reset_all_cb(TimestampTz ts)
void
pgstat_io_snapshot_cb(void)
{
+ if (unlikely(pgStatLocal.snapshot.io == NULL))
+ pgStatLocal.snapshot.io = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(PgStat_IO));
+
for (int i = 0; i < BACKEND_NUM_TYPES; i++)
{
LWLock *bktype_lock = &pgStatLocal.shmem->io.locks[i];
PgStat_BktypeIO *bktype_shstats = &pgStatLocal.shmem->io.stats.stats[i];
- PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io.stats[i];
+ PgStat_BktypeIO *bktype_snap = &pgStatLocal.snapshot.io->stats[i];
LWLockAcquire(bktype_lock, LW_SHARED);
@@ -321,7 +403,7 @@ pgstat_io_snapshot_cb(void)
* the reset timestamp as well.
*/
if (i == 0)
- pgStatLocal.snapshot.io.stat_reset_timestamp =
+ pgStatLocal.snapshot.io->stat_reset_timestamp =
pgStatLocal.shmem->io.stats.stat_reset_timestamp;
/* using struct assignment due to better type safety */
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 7a9dfa9ba3b..04eab7b3bcd 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -18,6 +18,7 @@
#include "access/xlog.h"
#include "access/xlogprefetcher.h"
#include "catalog/catalog.h"
+#include "catalog/namespace.h"
#include "catalog/pg_authid.h"
#include "catalog/pg_type.h"
#include "common/ip.h"
@@ -30,6 +31,7 @@
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
+#include "utils/rangetypes.h"
#include "utils/timestamp.h"
#include "utils/tuplestore.h"
#include "utils/wait_event.h"
@@ -1638,6 +1640,152 @@ pg_stat_get_backend_io(PG_FUNCTION_ARGS)
return (Datum) 0;
}
+/*
+* When adding a new column to the pg_stat_io_histogram view and the
+* pg_stat_get_io_histogram() function, add a new enum value here above
+* HIST_IO_NUM_COLUMNS.
+*/
+typedef enum hist_io_stat_col
+{
+ HIST_IO_COL_INVALID = -1,
+ HIST_IO_COL_BACKEND_TYPE,
+ HIST_IO_COL_OBJECT,
+ HIST_IO_COL_CONTEXT,
+ HIST_IO_COL_IOTYPE,
+ HIST_IO_COL_BUCKET_US,
+ HIST_IO_COL_COUNT,
+ HIST_IO_COL_RESET_TIME,
+ HIST_IO_NUM_COLUMNS
+} histogram_io_stat_col;
+
+/*
+ * pg_stat_io_histogram_build_tuples
+ *
+ * Helper routine for pg_stat_get_io_histogram() and pg_stat_get_backend_io()
+ * filling a result tuplestore with one tuple for each object and each
+ * context supported by the caller, based on the contents of bktype_stats.
+ */
+static void
+pg_stat_io_histogram_build_tuples(ReturnSetInfo *rsinfo,
+ PgStat_BktypeIO *bktype_stats,
+ BackendType bktype,
+ TimestampTz stat_reset_timestamp)
+{
+ /* Get OID for int4range type */
+ Datum bktype_desc = CStringGetTextDatum(GetBackendTypeDesc(bktype));
+ Oid range_typid = TypenameGetTypid("int4range");
+ TypeCacheEntry *typcache = lookup_type_cache(range_typid, TYPECACHE_RANGE_INFO);
+
+ for (int io_obj = 0; io_obj < IOOBJECT_NUM_TYPES; io_obj++)
+ {
+ const char *obj_name = pgstat_get_io_object_name(io_obj);
+
+ for (int io_context = 0; io_context < IOCONTEXT_NUM_TYPES; io_context++)
+ {
+ const char *context_name = pgstat_get_io_context_name(io_context);
+
+ /*
+ * Some combinations of BackendType, IOObject, and IOContext are
+ * not valid for any type of IOOp. In such cases, omit the entire
+ * row from the view.
+ */
+ if (!pgstat_tracks_io_object(bktype, io_obj, io_context))
+ continue;
+
+ for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
+ {
+ const char *op_name = pgstat_get_io_op_name(io_op);
+
+ for (int bucket = 0; bucket < PGSTAT_IO_HIST_BUCKETS; bucket++)
+ {
+ Datum values[HIST_IO_NUM_COLUMNS] = {0};
+ bool nulls[HIST_IO_NUM_COLUMNS] = {0};
+ RangeBound lower,
+ upper;
+ RangeType *range;
+
+ values[HIST_IO_COL_BACKEND_TYPE] = bktype_desc;
+ values[HIST_IO_COL_OBJECT] = CStringGetTextDatum(obj_name);
+ values[HIST_IO_COL_CONTEXT] = CStringGetTextDatum(context_name);
+ values[HIST_IO_COL_IOTYPE] = CStringGetTextDatum(op_name);
+
+ /* bucket's maximum latency as range in microseconds */
+ if (bucket == 0)
+ lower.val = Int32GetDatum(0);
+ else
+ lower.val = Int32GetDatum(1 << (2 + bucket));
+ lower.infinite = false;
+ lower.inclusive = true;
+ lower.lower = true;
+
+ if (bucket == PGSTAT_IO_HIST_BUCKETS - 1)
+ upper.infinite = true;
+ else
+ {
+ upper.val = Int32GetDatum(1 << (2 + bucket + 1));
+ upper.infinite = false;
+ }
+ upper.inclusive = false;
+ upper.lower = false;
+
+ range = make_range(typcache, &lower, &upper, false, NULL);
+ values[HIST_IO_COL_BUCKET_US] = RangeTypePGetDatum(range);
+
+ /* bucket count */
+ values[HIST_IO_COL_COUNT] = Int64GetDatum(
+ bktype_stats->hist_time_buckets[io_obj][io_context][io_op][bucket]);
+
+ if (stat_reset_timestamp != 0)
+ values[HIST_IO_COL_RESET_TIME] = TimestampTzGetDatum(stat_reset_timestamp);
+ else
+ nulls[HIST_IO_COL_RESET_TIME] = true;
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+ }
+ }
+ }
+}
+
+Datum
+pg_stat_get_io_histogram(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo;
+ PgStat_IO *backends_io_stats;
+
+ InitMaterializedSRF(fcinfo, 0);
+ rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+
+ backends_io_stats = pgstat_fetch_stat_io();
+
+ for (int bktype = 0; bktype < BACKEND_NUM_TYPES; bktype++)
+ {
+ PgStat_BktypeIO *bktype_stats = &backends_io_stats->stats[bktype];
+
+ /*
+ * In Assert builds, we can afford an extra loop through all of the
+ * counters (in pg_stat_io_build_tuples()), checking that only
+ * expected stats are non-zero, since it keeps the non-Assert code
+ * cleaner.
+ */
+ Assert(pgstat_bktype_io_stats_valid(bktype_stats, bktype));
+
+ /*
+ * For those BackendTypes without IO Operation stats, skip
+ * representing them in the view altogether.
+ */
+ if (!pgstat_tracks_io_bktype(bktype))
+ continue;
+
+ /* save tuples with data from this PgStat_BktypeIO */
+ pg_stat_io_histogram_build_tuples(rsinfo, bktype_stats, bktype,
+ backends_io_stats->stat_reset_timestamp);
+ }
+
+ return (Datum) 0;
+}
+
/*
* pg_stat_wal_build_tuple
*
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index fa9ae79082b..4b50f80140c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -6061,6 +6061,15 @@
proargnames => '{backend_type,object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time,extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,stats_reset}',
prosrc => 'pg_stat_get_io' },
+{ oid => '6149', descr => 'statistics: per backend type IO latency histogram',
+ proname => 'pg_stat_get_io_histogram', prorows => '30', proretset => 't',
+ provolatile => 'v', proparallel => 'r', prorettype => 'record',
+ proargtypes => '',
+ proallargtypes => '{text,text,text,text,int4range,int8,timestamptz}',
+ proargmodes => '{o,o,o,o,o,o,o}',
+ proargnames => '{backend_type,object,context,io_type,bucket_latency_us,bucket_count,stats_reset}',
+ prosrc => 'pg_stat_get_io_histogram' },
+
{ oid => '9375', descr => 'statistics: per lock type statistics',
proname => 'pg_stat_get_lock', prorows => '10', proretset => 't',
provolatile => 'v', proparallel => 'r', prorettype => 'record',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index dfa2e837638..34fd93f86dc 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -326,11 +326,23 @@ typedef enum IOOp
(((unsigned int) (io_op)) < IOOP_NUM_TYPES && \
((unsigned int) (io_op)) >= IOOP_EXTEND)
+/*
+ * This should represent balance between being fast and providing value
+ * to the users:
+ * 1. We want to cover various fast and slow device types (0.01ms - 15ms)
+ * 2. We want to also cover sporadic long tail latencies (hardware issues,
+ * delayed fsyncs, stuck I/O)
+ * 3. We want to be as small as possible here in terms of size:
+ * 16 * sizeof(uint64) = which should be less than two cachelines.
+ */
+#define PGSTAT_IO_HIST_BUCKETS 16
+
typedef struct PgStat_BktypeIO
{
uint64 bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+ uint64 hist_time_buckets[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES][PGSTAT_IO_HIST_BUCKETS];
} PgStat_BktypeIO;
typedef struct PgStat_PendingIO
@@ -338,8 +350,18 @@ typedef struct PgStat_PendingIO
uint64 bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
instr_time pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+
+ /*
+ * Dynamically allocated array of
+ * [IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES]
+ * [IOOP_NUM_TYPES][PGSTAT_IO_HIST_BUCKETS] only with track_io_timings
+ * true.
+ */
+ uint64 (*pending_hist_time_buckets)[IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES][PGSTAT_IO_HIST_BUCKETS];
} PgStat_PendingIO;
+extern PgStat_PendingIO PendingIOStats;
+
typedef struct PgStat_IO
{
TimestampTz stat_reset_timestamp;
@@ -526,15 +548,24 @@ typedef struct PgStat_Backend
} PgStat_Backend;
/* ---------
- * PgStat_BackendPending Non-flushed backend stats.
+ * PgStat_BackendPending(IO) Non-flushed backend stats.
* ---------
*/
+typedef struct PgStat_BackendPendingIO
+{
+ uint64 bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+ PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+ instr_time pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
+} PgStat_BackendPendingIO;
+
typedef struct PgStat_BackendPending
{
/*
- * Backend statistics store the same amount of IO data as PGSTAT_KIND_IO.
+ * Backend statistics store almost the same amount of IO data as
+ * PGSTAT_KIND_IO. The only difference between PgStat_BackendPendingIO and
+ * PgStat_PendingIO is that the latter also track IO latency histograms.
*/
- PgStat_PendingIO pending_io;
+ PgStat_BackendPendingIO pending_io;
} PgStat_BackendPending;
/*
@@ -624,6 +655,7 @@ extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context,
extern PgStat_IO *pgstat_fetch_stat_io(void);
extern const char *pgstat_get_io_context_name(IOContext io_context);
extern const char *pgstat_get_io_object_name(IOObject io_object);
+extern const char *pgstat_get_io_op_name(IOOp io_op);
extern bool pgstat_tracks_io_bktype(BackendType bktype);
extern bool pgstat_tracks_io_object(BackendType bktype,
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 7a00d197013..b27913a2ad8 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -32,6 +32,42 @@ extern PGDLLIMPORT const uint8 pg_leftmost_one_pos[256];
extern PGDLLIMPORT const uint8 pg_rightmost_one_pos[256];
extern PGDLLIMPORT const uint8 pg_number_of_ones[256];
+
+/*
+ * pg_leading_zero_bits64
+ * Returns the number of leading 0-bits in x, starting at the most significant bit position.
+ * Word must not be 0 (as it is undefined behavior).
+ */
+static inline int
+pg_leading_zero_bits64(uint64 word)
+{
+#ifdef HAVE__BUILTIN_CLZL
+ Assert(word != 0);
+
+#if SIZEOF_LONG == 8
+ return __builtin_clzl(word);
+#elif SIZEOF_LONG_LONG == 8
+ return __builtin_clzll(word);
+#else
+#error "cannot find integer type of the same size as uint64_t"
+#endif
+
+#else
+ uint64 y;
+ int n = 64;
+ if (word == 0)
+ return 64;
+
+ y = word >> 32; if (y != 0) { n -= 32; word = y; }
+ y = word >> 16; if (y != 0) { n -= 16; word = y; }
+ y = word >> 8; if (y != 0) { n -= 8; word = y; }
+ y = word >> 4; if (y != 0) { n -= 4; word = y; }
+ y = word >> 2; if (y != 0) { n -= 2; word = y; }
+ y = word >> 1; if (y != 0) { return n - 2; }
+ return n - 1;
+#endif
+}
+
/*
* pg_leftmost_one_pos32
* Returns the position of the most significant set bit in "word",
@@ -71,7 +107,7 @@ pg_leftmost_one_pos32(uint32 word)
static inline int
pg_leftmost_one_pos64(uint64 word)
{
-#ifdef HAVE__BUILTIN_CLZ
+#ifdef HAVE__BUILTIN_CLZL
Assert(word != 0);
#if SIZEOF_LONG == 8
diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index fe463faaf63..a3ce8b04723 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -608,7 +608,7 @@ typedef struct PgStat_Snapshot
PgStat_CheckpointerStats checkpointer;
- PgStat_IO io;
+ PgStat_IO *io;
PgStat_Lock lock;
diff --git a/src/test/recovery/t/029_stats_restart.pl b/src/test/recovery/t/029_stats_restart.pl
index cdc427dbc78..33939c8701a 100644
--- a/src/test/recovery/t/029_stats_restart.pl
+++ b/src/test/recovery/t/029_stats_restart.pl
@@ -293,7 +293,36 @@ cmp_ok(
$wal_restart_immediate->{reset},
"$sect: reset timestamp is new");
+
+## Test pg_stat_io_histogram that is becoming active due to dynamic memory
+## allocation only for new backends with globally set track_[io|wal_io]_timing
+$sect = "pg_stat_io_histogram";
+$node->append_conf('postgresql.conf', "track_io_timing = 'on'");
+$node->append_conf('postgresql.conf', "track_wal_io_timing = 'on'");
+$node->restart;
+
+
+## Check that pg_stat_io_histograms sees some growing counts in buckets
+## We could also try with checkpointer, but it often runs with fsync=off
+## during test.
+my $countbefore = $node->safe_psql('postgres',
+ "SELECT sum(bucket_count) AS hist_bucket_count_sum FROM pg_stat_get_io_histogram() " .
+ "WHERE backend_type='client backend' AND object='relation' AND context='normal'");
+
+$node->safe_psql('postgres', "CREATE TABLE test_io_hist(id bigint);");
+$node->safe_psql('postgres', "INSERT INTO test_io_hist SELECT generate_series(1, 100) s;");
+$node->safe_psql('postgres', "SELECT pg_stat_force_next_flush();");
+
+my $countafter = $node->safe_psql('postgres',
+ "SELECT sum(bucket_count) AS hist_bucket_count_sum FROM pg_stat_get_io_histogram() " .
+ "WHERE backend_type='client backend' AND object='relation' AND context='normal'");
+
+cmp_ok(
+ $countafter, '>', $countbefore,
+ "pg_stat_io_histogram: latency buckets growing");
+
$node->stop;
+
done_testing();
sub trigger_funcrel_stat
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index a65a5bf0c4f..c0067cb653b 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1967,6 +1967,14 @@ pg_stat_io| SELECT backend_type,
fsync_time,
stats_reset
FROM pg_stat_get_io() b(backend_type, object, context, reads, read_bytes, read_time, writes, write_bytes, write_time, writebacks, writeback_time, extends, extend_bytes, extend_time, hits, evictions, reuses, fsyncs, fsync_time, stats_reset);
+pg_stat_io_histogram| SELECT backend_type,
+ object,
+ context,
+ io_type,
+ bucket_latency_us,
+ bucket_count,
+ stats_reset
+ FROM pg_stat_get_io_histogram() b(backend_type, object, context, io_type, bucket_latency_us, bucket_count, stats_reset);
pg_stat_lock| SELECT locktype,
waits,
wait_time,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 3ade7c08b6d..7a6b604ce5e 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3838,6 +3838,7 @@ gzFile
having_collation_ctx
heap_page_items_state
help_handler
+histogram_io_stat_col
hlCheck
hstoreCheckKeyLen_t
hstoreCheckValLen_t
--
2.43.0