v4-0007-Index-scans-Split-heap-and-index-buffer-access-re.patch

application/octet-stream

Filename: v4-0007-Index-scans-Split-heap-and-index-buffer-access-re.patch
Type: application/octet-stream
Part: 6
Message: Re: Stack-based tracking of per-node WAL/buffer usage
From ccea6e453872a0ae63351b3ba4360845035ec621 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Thu, 30 Oct 2025 22:27:30 -0700
Subject: [PATCH v4 7/7] Index scans: Split heap and index buffer access
 reporting in EXPLAIN

This makes it clear whether activity was on the index directly, or
on the table based on heap fetches.
---
 src/backend/commands/explain.c       | 56 ++++++++++++++++------------
 src/backend/executor/execProcnode.c  | 13 +++++++
 src/backend/executor/instrument.c    | 25 +++++++++++++
 src/backend/executor/nodeIndexscan.c | 15 +++++++-
 src/include/access/genam.h           |  3 ++
 src/include/executor/instrument.h    |  3 ++
 6 files changed, 91 insertions(+), 24 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index dd3bf615581..fb96dd5248c 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
 static const char *explain_get_index_name(Oid indexId);
 static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
 static void show_wal_usage(ExplainState *es, const WalUsage *usage);
 static void show_memory_counters(ExplainState *es,
 								 const MemoryContextCounters *mem_counters);
@@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		}
 
 		if (bufusage)
-			show_buffer_usage(es, bufusage);
+			show_buffer_usage(es, bufusage, NULL);
 
 		if (mem_counters)
 			show_memory_counters(es, mem_counters);
@@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
 		if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
 		{
 			es->indent++;
-			show_buffer_usage(es, &metrics->bufferUsage);
+			show_buffer_usage(es, &metrics->bufferUsage, NULL);
 			es->indent--;
 		}
 	}
@@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
 								BYTES_TO_KILOBYTES(metrics->bytesSent), es);
 		ExplainPropertyText("Format", format, es);
 		if (es->buffers)
-			show_buffer_usage(es, &metrics->bufferUsage);
+			show_buffer_usage(es, &metrics->bufferUsage, NULL);
 	}
 
 	ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1960,6 +1960,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
 			show_indexsearches_info(planstate, es);
+
+			if (es->buffers && planstate->instrument)
+				show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument.table_stack.bufusage, "Table");
 			break;
 		case T_IndexOnlyScan:
 			show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2278,7 +2281,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
 
 	/* Show buffer/WAL usage */
 	if (es->buffers && planstate->instrument)
-		show_buffer_usage(es, &planstate->instrument->stack.bufusage);
+		show_buffer_usage(es, &planstate->instrument->stack.bufusage,
+						  IsA(plan, IndexScan) ? "Index" : NULL);
 	if (es->wal && planstate->instrument)
 		show_wal_usage(es, &planstate->instrument->stack.walusage);
 
@@ -2297,7 +2301,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 
 			ExplainOpenWorker(n, es);
 			if (es->buffers)
-				show_buffer_usage(es, &instrument->stack.bufusage);
+				show_buffer_usage(es, &instrument->stack.bufusage, NULL);
 			if (es->wal)
 				show_wal_usage(es, &instrument->stack.walusage);
 			ExplainCloseWorker(n, es);
@@ -4097,7 +4101,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
  * Show buffer usage details.  This better be sync with peek_buffer_usage.
  */
 static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
 {
 	if (es->format == EXPLAIN_FORMAT_TEXT)
 	{
@@ -4122,6 +4126,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
 		if (has_shared || has_local || has_temp)
 		{
 			ExplainIndentText(es);
+			if (title)
+				appendStringInfo(es->str, "%s ", title);
 			appendStringInfoString(es->str, "Buffers:");
 
 			if (has_shared)
@@ -4177,6 +4183,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
 		if (has_shared_timing || has_local_timing || has_temp_timing)
 		{
 			ExplainIndentText(es);
+			if (title)
+				appendStringInfo(es->str, "%s ", title);
 			appendStringInfoString(es->str, "I/O Timings:");
 
 			if (has_shared_timing)
@@ -4218,44 +4226,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
 	}
 	else
 	{
-		ExplainPropertyInteger("Shared Hit Blocks", NULL,
+		char	   *prefix = title ? psprintf("%s ", title) : pstrdup("");
+
+		ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL,
 							   usage->shared_blks_hit, es);
-		ExplainPropertyInteger("Shared Read Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL,
 							   usage->shared_blks_read, es);
-		ExplainPropertyInteger("Shared Dirtied Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL,
 							   usage->shared_blks_dirtied, es);
-		ExplainPropertyInteger("Shared Written Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL,
 							   usage->shared_blks_written, es);
-		ExplainPropertyInteger("Local Hit Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL,
 							   usage->local_blks_hit, es);
-		ExplainPropertyInteger("Local Read Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL,
 							   usage->local_blks_read, es);
-		ExplainPropertyInteger("Local Dirtied Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL,
 							   usage->local_blks_dirtied, es);
-		ExplainPropertyInteger("Local Written Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL,
 							   usage->local_blks_written, es);
-		ExplainPropertyInteger("Temp Read Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL,
 							   usage->temp_blks_read, es);
-		ExplainPropertyInteger("Temp Written Blocks", NULL,
+		ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL,
 							   usage->temp_blks_written, es);
 		if (track_io_timing)
 		{
-			ExplainPropertyFloat("Shared I/O Read Time", "ms",
+			ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
 								 3, es);
-			ExplainPropertyFloat("Shared I/O Write Time", "ms",
+			ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time),
 								 3, es);
-			ExplainPropertyFloat("Local I/O Read Time", "ms",
+			ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time),
 								 3, es);
-			ExplainPropertyFloat("Local I/O Write Time", "ms",
+			ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time),
 								 3, es);
-			ExplainPropertyFloat("Temp I/O Read Time", "ms",
+			ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
 								 3, es);
-			ExplainPropertyFloat("Temp I/O Write Time", "ms",
+			ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
 								 3, es);
 		}
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d00cf820a27..f19af428d97 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -854,7 +854,20 @@ ExecAccumNodeInstrumentation_walker(PlanState *node, void *context)
 	planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context);
 
 	if (node->instrument && node->instrument->stack.previous)
+	{
+		/*
+		 * Index Scan nodes account for heap buffer usage separately, so we
+		 * need to explitly add here
+		 */
+		if (IsA(node, IndexScanState))
+		{
+			IndexScanState *iss = castNode(IndexScanState, node);
+
+			InstrStackAdd(node->instrument->stack.previous, &iss->iss_Instrument.table_stack);
+		}
+
 		InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack);
+	}
 
 	return false;
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 2a141fdae07..f6abc8d0c19 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -395,6 +395,31 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
 		WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
 }
 
+void
+InstrStartNodeStack(NodeInstrumentation * instr, InstrStack * stack)
+{
+	if (instr->need_bufusage || instr->need_walusage)
+	{
+		/* Ensure that we always have a parent, even at the top most node */
+		Assert(pgInstrStack != NULL);
+
+		InstrPushStack(stack);
+	}
+}
+
+void
+InstrStopNodeStack(NodeInstrumentation * instr, InstrStack * stack)
+{
+	if (instr->need_bufusage || instr->need_walusage)
+	{
+		/* Ensure that we always have a parent, even at the top most node */
+		Assert(stack->previous != NULL);
+
+		/* Adding to parent is handled by ExecAccumNodeInstrumentation */
+		InstrPopStack(stack, false);
+	}
+}
+
 /* start instrumentation during parallel executor startup */
 Instrumentation *
 InstrStartParallelQuery(void)
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 7fcaa37fe62..fcb611acb4a 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -83,6 +83,7 @@ IndexNext(IndexScanState *node)
 	ExprContext *econtext;
 	ScanDirection direction;
 	IndexScanDesc scandesc;
+	ItemPointer tid;
 	TupleTableSlot *slot;
 
 	/*
@@ -128,8 +129,20 @@ IndexNext(IndexScanState *node)
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
 	{
+		if (node->ss.ps.instrument)
+			InstrStartNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack);
+
+		if (unlikely(!index_fetch_heap(scandesc, slot)))
+			continue;
+
+		if (node->ss.ps.instrument)
+			InstrStopNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack);
+
+		if (scandesc->xs_heap_continue)
+			elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
 		CHECK_FOR_INTERRUPTS();
 
 		/*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9200a22bd9f..7813b4688f5 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -17,6 +17,7 @@
 #include "access/htup.h"
 #include "access/sdir.h"
 #include "access/skey.h"
+#include "executor/instrument.h"
 #include "nodes/tidbitmap.h"
 #include "storage/buf.h"
 #include "storage/lockdefs.h"
@@ -40,6 +41,8 @@ typedef struct IndexScanInstrumentation
 {
 	/* Index search count (incremented with pgstat_count_index_scan call) */
 	uint64		nsearches;
+	/* Buffer usage of heap access during index scans */
+	InstrStack	table_stack;
 } IndexScanInstrumentation;
 
 /*
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index a627cfcac2d..9bff6d8303f 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -185,6 +185,9 @@ extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
 extern void InstrEndLoop(NodeInstrumentation * instr);
 extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
 
+extern void InstrStartNodeStack(NodeInstrumentation * dst, InstrStack * stack);
+extern void InstrStopNodeStack(NodeInstrumentation * dst, InstrStack * stack);
+
 extern Instrumentation *InstrStartParallelQuery(void);
 extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage);
 extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
-- 
2.47.1