Thread

  1. [RFC PATCH v0 1/7] Add EXPLAIN WAITS statement reporting

    Ilmar Y <tanswis42@gmail.com> — 2026-05-08T23:22:31Z

    ---
     doc/src/sgml/ref/explain.sgml           |  14 +++
     src/backend/commands/explain.c          | 129 +++++++++++++++++++-
     src/backend/commands/explain_state.c    |   8 ++
     src/backend/utils/activity/wait_event.c | 149 ++++++++++++++++++++++++
     src/include/commands/explain_state.h    |   1 +
     src/include/utils/wait_event.h          |  29 +++++
     src/test/regress/expected/explain.out   |  17 +++
     src/test/regress/sql/explain.sql        |   5 +
     8 files changed, 349 insertions(+), 3 deletions(-)
    
    diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
    index e95e19081e1..5460568af1d 100644
    --- a/doc/src/sgml/ref/explain.sgml
    +++ b/doc/src/sgml/ref/explain.sgml
    @@ -47,6 +47,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
         SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
         MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
         IO [ <replaceable class="parameter">boolean</replaceable> ]
    +    WAITS [ <replaceable class="parameter">boolean</replaceable> ]
         FORMAT { TEXT | XML | JSON | YAML }
     </synopsis>
      </refsynopsisdiv>
    @@ -310,6 +311,19 @@ ROLLBACK;
         </listitem>
        </varlistentry>
     
    +   <varlistentry>
    +    <term><literal>WAITS</literal></term>
    +    <listitem>
    +     <para>
    +      Include exact backend-local wait event timing observed during query
    +      execution.  Wait events are grouped by wait event type and wait event
    +      name, and include the number of waits and total time spent waiting.
    +      This parameter may only be used when <literal>ANALYZE</literal> is also
    +      enabled.  It defaults to <literal>FALSE</literal>.
    +     </para>
    +    </listitem>
    +   </varlistentry>
    +
        <varlistentry>
         <term><literal>FORMAT</literal></term>
         <listitem>
    diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
    index 112c17b0d64..0b7cc5c15c6 100644
    --- a/src/backend/commands/explain.c
    +++ b/src/backend/commands/explain.c
    @@ -45,6 +45,7 @@
     #include "utils/tuplesort.h"
     #include "utils/tuplestore.h"
     #include "utils/typcache.h"
    +#include "utils/wait_event.h"
     #include "utils/xml.h"
     
     
    @@ -149,6 +150,9 @@ static const char *explain_get_index_name(Oid indexId);
     static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
     static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
     static void show_wal_usage(ExplainState *es, const WalUsage *usage);
    +static int	wait_event_usage_cmp(const void *a, const void *b);
    +static void show_wait_event_usage(ExplainState *es,
    +								  const WaitEventUsage *usage);
     static void show_memory_counters(ExplainState *es,
     								 const MemoryContextCounters *mem_counters);
     static void show_result_replacement_info(Result *result, ExplainState *es);
    @@ -510,6 +514,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
     	int			eflags;
     	int			instrument_option = 0;
     	SerializeMetrics serializeMetrics = {0};
    +	WaitEventUsage waitEventUsage;
    +	WaitEventUsage *waitEventUsagePtr = NULL;
     
     	Assert(plannedstmt->commandType != CMD_UTILITY);
     
    @@ -583,11 +589,27 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
     		else
     			dir = ForwardScanDirection;
     
    +		if (es->waits)
    +		{
    +			waitEventUsagePtr = &waitEventUsage;
    +			pgstat_begin_wait_event_usage(waitEventUsagePtr,
    +										  queryDesc->estate->es_query_cxt);
    +		}
    +
     		/* run the plan */
    -		ExecutorRun(queryDesc, dir, 0);
    +		PG_TRY();
    +		{
    +			ExecutorRun(queryDesc, dir, 0);
     
    -		/* run cleanup too */
    -		ExecutorFinish(queryDesc);
    +			/* run cleanup too */
    +			ExecutorFinish(queryDesc);
    +		}
    +		PG_FINALLY();
    +		{
    +			if (waitEventUsagePtr)
    +				pgstat_end_wait_event_usage(waitEventUsagePtr);
    +		}
    +		PG_END_TRY();
     
     		/* We can't run ExecutorEnd 'till we're done printing the stats... */
     		totaltime += elapsed_time(&starttime);
    @@ -605,6 +627,9 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
     	/* Create textual dump of plan tree */
     	ExplainPrintPlan(es, queryDesc);
     
    +	if (waitEventUsagePtr)
    +		show_wait_event_usage(es, waitEventUsagePtr);
    +
     	/* Show buffer and/or memory usage in planning */
     	if (peek_buffer_usage(es, bufusage) || mem_counters)
     	{
    @@ -4503,6 +4528,104 @@ show_wal_usage(ExplainState *es, const WalUsage *usage)
     	}
     }
     
    +static int
    +wait_event_usage_cmp(const void *a, const void *b)
    +{
    +	const WaitEventUsageEntry *ea = (const WaitEventUsageEntry *) a;
    +	const WaitEventUsageEntry *eb = (const WaitEventUsageEntry *) b;
    +	int64		ta = INSTR_TIME_GET_MICROSEC(ea->time);
    +	int64		tb = INSTR_TIME_GET_MICROSEC(eb->time);
    +
    +	if (ta < tb)
    +		return 1;
    +	if (ta > tb)
    +		return -1;
    +	if (ea->wait_event_info < eb->wait_event_info)
    +		return -1;
    +	if (ea->wait_event_info > eb->wait_event_info)
    +		return 1;
    +	return 0;
    +}
    +
    +static void
    +show_wait_event_usage(ExplainState *es, const WaitEventUsage *usage)
    +{
    +	WaitEventUsageEntry *entries;
    +
    +	if (usage == NULL)
    +		return;
    +
    +	if (es->format == EXPLAIN_FORMAT_TEXT && usage->nentries == 0)
    +		return;
    +
    +	if (usage->nentries > 0)
    +	{
    +		entries = palloc_array(WaitEventUsageEntry, usage->nentries);
    +		memcpy(entries, usage->entries,
    +			   sizeof(WaitEventUsageEntry) * usage->nentries);
    +		qsort(entries, usage->nentries, sizeof(WaitEventUsageEntry),
    +			  wait_event_usage_cmp);
    +	}
    +	else
    +		entries = NULL;
    +
    +	if (es->format == EXPLAIN_FORMAT_TEXT)
    +	{
    +		ExplainIndentText(es);
    +		appendStringInfoString(es->str, "Wait Events:\n");
    +		es->indent++;
    +
    +		for (int i = 0; i < usage->nentries; i++)
    +		{
    +			const char *event_type;
    +			const char *event_name;
    +
    +			event_type = pgstat_get_wait_event_type(entries[i].wait_event_info);
    +			event_name = pgstat_get_wait_event(entries[i].wait_event_info);
    +
    +			ExplainIndentText(es);
    +			appendStringInfo(es->str, "%s:%s calls=%" PRIu64 " time=%0.3f ms\n",
    +							 event_type ? event_type : "Unknown",
    +							 event_name ? event_name : "unknown",
    +							 entries[i].calls,
    +							 INSTR_TIME_GET_MILLISEC(entries[i].time));
    +		}
    +
    +		es->indent--;
    +	}
    +	else
    +	{
    +		ExplainOpenGroup("Wait-Events", "Wait Events", false, es);
    +
    +		for (int i = 0; i < usage->nentries; i++)
    +		{
    +			const char *event_type;
    +			const char *event_name;
    +
    +			event_type = pgstat_get_wait_event_type(entries[i].wait_event_info);
    +			event_name = pgstat_get_wait_event(entries[i].wait_event_info);
    +
    +			ExplainOpenGroup("Wait-Event", NULL, true, es);
    +			ExplainPropertyText("Wait Event Type",
    +								event_type ? event_type : "Unknown",
    +								es);
    +			ExplainPropertyText("Wait Event",
    +								event_name ? event_name : "unknown",
    +								es);
    +			ExplainPropertyUInteger("Calls", NULL, entries[i].calls, es);
    +			ExplainPropertyFloat("Time", "ms",
    +								 INSTR_TIME_GET_MILLISEC(entries[i].time),
    +								 3, es);
    +			ExplainCloseGroup("Wait-Event", NULL, true, es);
    +		}
    +
    +		ExplainCloseGroup("Wait-Events", "Wait Events", false, es);
    +	}
    +
    +	if (entries)
    +		pfree(entries);
    +}
    +
     /*
      * Show memory usage details.
      */
    diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
    index a0ee0a664be..2abcb64cc95 100644
    --- a/src/backend/commands/explain_state.c
    +++ b/src/backend/commands/explain_state.c
    @@ -164,6 +164,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
     		}
     		else if (strcmp(opt->defname, "io") == 0)
     			es->io = defGetBoolean(opt);
    +		else if (strcmp(opt->defname, "waits") == 0)
    +			es->waits = defGetBoolean(opt);
     		else if (!ApplyExtensionExplainOption(es, opt, pstate))
     			ereport(ERROR,
     					(errcode(ERRCODE_SYNTAX_ERROR),
    @@ -196,6 +198,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
     				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     				 errmsg("EXPLAIN option %s requires ANALYZE", "IO")));
     
    +	/* check that WAITS is used with EXPLAIN ANALYZE */
    +	if (es->waits && !es->analyze)
    +		ereport(ERROR,
    +				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    +				 errmsg("EXPLAIN option %s requires ANALYZE", "WAITS")));
    +
     	/* check that serialize is used with EXPLAIN ANALYZE */
     	if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze)
     		ereport(ERROR,
    diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
    index 95635c7f56c..60d37ccbb73 100644
    --- a/src/backend/utils/activity/wait_event.c
    +++ b/src/backend/utils/activity/wait_event.c
    @@ -36,11 +36,20 @@ static const char *pgstat_get_wait_client(WaitEventClient w);
     static const char *pgstat_get_wait_ipc(WaitEventIPC w);
     static const char *pgstat_get_wait_timeout(WaitEventTimeout w);
     static const char *pgstat_get_wait_io(WaitEventIO w);
    +static void WaitEventUsageAdd(WaitEventUsage *usage, uint32 wait_event_info,
    +							  const instr_time *elapsed);
     
     
     static uint32 local_my_wait_event_info;
     uint32	   *my_wait_event_info = &local_my_wait_event_info;
     
    +#define WAIT_EVENT_USAGE_INITIAL_EVENTS	16
    +
    +int			pgstat_wait_event_usage_depth = 0;
    +static WaitEventUsage *pgstat_wait_event_usage = NULL;
    +static uint32 pgstat_wait_event_usage_current = 0;
    +static instr_time pgstat_wait_event_usage_start;
    +
     #define WAIT_EVENT_CLASS_MASK	0xFF000000
     #define WAIT_EVENT_ID_MASK		0x0000FFFF
     
    @@ -349,6 +358,146 @@ pgstat_reset_wait_event_storage(void)
     	my_wait_event_info = &local_my_wait_event_info;
     }
     
    +/*
    + * Start collecting exact wait event timings in this backend.
    + *
    + * This is intended for short-lived instrumentation such as EXPLAIN ANALYZE.
    + * It records waits observed through pgstat_report_wait_start/end in backend
    + * local memory.  Nested collection is deliberately treated as part of the
    + * outer collection for now; callers that want independent nested accounting
    + * need a stack of WaitEventUsage contexts.
    + */
    +void
    +pgstat_begin_wait_event_usage(WaitEventUsage *usage, MemoryContext memcontext)
    +{
    +	Assert(usage != NULL);
    +	Assert(memcontext != NULL);
    +
    +	if (pgstat_wait_event_usage_depth++ == 0)
    +	{
    +		memset(usage, 0, sizeof(WaitEventUsage));
    +		usage->memcontext = memcontext;
    +		pgstat_wait_event_usage = usage;
    +		pgstat_wait_event_usage_current = 0;
    +		INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
    +	}
    +}
    +
    +/*
    + * Stop collecting wait event timings.
    + */
    +void
    +pgstat_end_wait_event_usage(WaitEventUsage *usage)
    +{
    +	Assert(usage != NULL);
    +	Assert(pgstat_wait_event_usage_depth > 0);
    +
    +	if (--pgstat_wait_event_usage_depth == 0)
    +	{
    +		if (pgstat_wait_event_usage_current != 0)
    +			pgstat_count_wait_event_end();
    +
    +		pgstat_wait_event_usage = NULL;
    +		pgstat_wait_event_usage_current = 0;
    +		INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
    +	}
    +}
    +
    +/*
    + * Record the beginning of a wait event for exact EXPLAIN-style accounting.
    + */
    +void
    +pgstat_count_wait_event_start(uint32 wait_event_info)
    +{
    +	if (pgstat_wait_event_usage == NULL)
    +		return;
    +
    +	/*
    +	 * Waits are not expected to nest.  If they do, finish the previous wait
    +	 * at the boundary so accounting remains internally consistent.
    +	 */
    +	if (pgstat_wait_event_usage_current != 0)
    +		pgstat_count_wait_event_end();
    +
    +	pgstat_wait_event_usage_current = wait_event_info;
    +	INSTR_TIME_SET_CURRENT(pgstat_wait_event_usage_start);
    +}
    +
    +/*
    + * Record the end of the current wait event.
    + */
    +void
    +pgstat_count_wait_event_end(void)
    +{
    +	instr_time	end;
    +	instr_time	elapsed;
    +
    +	if (pgstat_wait_event_usage == NULL ||
    +		pgstat_wait_event_usage_current == 0)
    +		return;
    +
    +	INSTR_TIME_SET_CURRENT(end);
    +	elapsed = end;
    +	INSTR_TIME_SUBTRACT(elapsed, pgstat_wait_event_usage_start);
    +
    +	WaitEventUsageAdd(pgstat_wait_event_usage,
    +					  pgstat_wait_event_usage_current,
    +					  &elapsed);
    +
    +	pgstat_wait_event_usage_current = 0;
    +	INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
    +}
    +
    +static void
    +WaitEventUsageAdd(WaitEventUsage *usage, uint32 wait_event_info,
    +				  const instr_time *elapsed)
    +{
    +	WaitEventUsageEntry *entry = NULL;
    +
    +	for (int i = 0; i < usage->nentries; i++)
    +	{
    +		if (usage->entries[i].wait_event_info == wait_event_info)
    +		{
    +			entry = &usage->entries[i];
    +			break;
    +		}
    +	}
    +
    +	if (entry == NULL)
    +	{
    +		if (usage->nentries >= usage->maxentries)
    +		{
    +			MemoryContext oldcontext;
    +			int			newmaxentries;
    +
    +			if (usage->maxentries > 0)
    +				newmaxentries = usage->maxentries * 2;
    +			else
    +				newmaxentries = WAIT_EVENT_USAGE_INITIAL_EVENTS;
    +
    +			oldcontext = MemoryContextSwitchTo(usage->memcontext);
    +			if (usage->entries)
    +				usage->entries = repalloc_array(usage->entries,
    +												WaitEventUsageEntry,
    +												newmaxentries);
    +			else
    +				usage->entries = palloc_array(WaitEventUsageEntry,
    +											  newmaxentries);
    +			MemoryContextSwitchTo(oldcontext);
    +
    +			usage->maxentries = newmaxentries;
    +		}
    +
    +		entry = &usage->entries[usage->nentries++];
    +		entry->wait_event_info = wait_event_info;
    +		entry->calls = 0;
    +		INSTR_TIME_SET_ZERO(entry->time);
    +	}
    +
    +	entry->calls++;
    +	INSTR_TIME_ADD(entry->time, *elapsed);
    +}
    +
     /* ----------
      * pgstat_get_wait_event_type() -
      *
    diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
    index 97bc7ed49f6..4f0d0d495d4 100644
    --- a/src/include/commands/explain_state.h
    +++ b/src/include/commands/explain_state.h
    @@ -56,6 +56,7 @@ typedef struct ExplainState
     	bool		memory;			/* print planner's memory usage information */
     	bool		settings;		/* print modified settings */
     	bool		io;				/* print info about IO (prefetch, ...) */
    +	bool		waits;			/* print wait event usage */
     	bool		generic;		/* generate a generic plan */
     	ExplainSerializeOption serialize;	/* serialize the query's output? */
     	ExplainFormat format;		/* output format */
    diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
    index 86ee348220d..f7fab5736bb 100644
    --- a/src/include/utils/wait_event.h
    +++ b/src/include/utils/wait_event.h
    @@ -11,16 +11,39 @@
     #define WAIT_EVENT_H
     
     /* enums for wait events */
    +#include "portability/instr_time.h"
    +#include "utils/palloc.h"
     #include "utils/wait_event_types.h"
     
    +typedef struct WaitEventUsageEntry
    +{
    +	uint32		wait_event_info;
    +	uint64		calls;
    +	instr_time	time;
    +} WaitEventUsageEntry;
    +
    +typedef struct WaitEventUsage
    +{
    +	MemoryContext memcontext;
    +	int			nentries;
    +	int			maxentries;
    +	WaitEventUsageEntry *entries;
    +} WaitEventUsage;
    +
     extern const char *pgstat_get_wait_event(uint32 wait_event_info);
     extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
     static inline void pgstat_report_wait_start(uint32 wait_event_info);
     static inline void pgstat_report_wait_end(void);
     extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
     extern void pgstat_reset_wait_event_storage(void);
    +extern void pgstat_begin_wait_event_usage(WaitEventUsage *usage,
    +										  MemoryContext memcontext);
    +extern void pgstat_end_wait_event_usage(WaitEventUsage *usage);
    +extern void pgstat_count_wait_event_start(uint32 wait_event_info);
    +extern void pgstat_count_wait_event_end(void);
     
     extern PGDLLIMPORT uint32 *my_wait_event_info;
    +extern PGDLLIMPORT int pgstat_wait_event_usage_depth;
     
     
     /*
    @@ -66,6 +89,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents);
     static inline void
     pgstat_report_wait_start(uint32 wait_event_info)
     {
    +	if (pgstat_wait_event_usage_depth > 0)
    +		pgstat_count_wait_event_start(wait_event_info);
    +
     	/*
     	 * Since this is a four-byte field which is always read and written as
     	 * four-bytes, updates are atomic.
    @@ -82,6 +108,9 @@ pgstat_report_wait_start(uint32 wait_event_info)
     static inline void
     pgstat_report_wait_end(void)
     {
    +	if (pgstat_wait_event_usage_depth > 0)
    +		pgstat_count_wait_event_end();
    +
     	/* see pgstat_report_wait_start() */
     	*(volatile uint32 *) my_wait_event_info = 0;
     }
    diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
    index 74a4d87801e..2c7a7e1d4c6 100644
    --- a/src/test/regress/expected/explain.out
    +++ b/src/test/regress/expected/explain.out
    @@ -99,6 +99,23 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
      Seq Scan on int8_tbl i8  (cost=N.N..N.N rows=N width=N)
     (1 row)
     
    +-- WAITS option
    +select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
    +            explain_filter             
    +---------------------------------------
    + Result (actual rows=N.N loops=N)
    + Wait Events:
    +   Timeout:PgSleep calls=N time=N.N ms
    +(3 rows)
    +
    +select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
    +                                     ?column?                                     
    +----------------------------------------------------------------------------------
    + {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
    +(1 row)
    +
    +explain (waits) select 1;
    +ERROR:  EXPLAIN option WAITS requires ANALYZE
     \a
     select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
     explain_filter
    diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
    index 2f163c64bf6..fe025ddeac5 100644
    --- a/src/test/regress/sql/explain.sql
    +++ b/src/test/regress/sql/explain.sql
    @@ -68,6 +68,11 @@ select explain_filter('explain (analyze, buffers off, verbose) select * from int
     select explain_filter('explain (analyze, buffers, format text) select * from int8_tbl i8');
     select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
     
    +-- WAITS option
    +select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
    +select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
    +explain (waits) select 1;
    +
     \a
     select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
     select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
    -- 
    2.52.0