v4-0007-Index-scans-Split-heap-and-index-buffer-access-re.patch
application/octet-stream
Filename: v4-0007-Index-scans-Split-heap-and-index-buffer-access-re.patch
Type: application/octet-stream
Part: 6
From ccea6e453872a0ae63351b3ba4360845035ec621 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Thu, 30 Oct 2025 22:27:30 -0700
Subject: [PATCH v4 7/7] Index scans: Split heap and index buffer access
reporting in EXPLAIN
This makes it clear whether activity was on the index directly, or
on the table based on heap fetches.
---
src/backend/commands/explain.c | 56 ++++++++++++++++------------
src/backend/executor/execProcnode.c | 13 +++++++
src/backend/executor/instrument.c | 25 +++++++++++++
src/backend/executor/nodeIndexscan.c | 15 +++++++-
src/include/access/genam.h | 3 ++
src/include/executor/instrument.h | 3 ++
6 files changed, 91 insertions(+), 24 deletions(-)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index dd3bf615581..fb96dd5248c 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
static const char *explain_get_index_name(Oid indexId);
static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage);
-static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title);
static void show_wal_usage(ExplainState *es, const WalUsage *usage);
static void show_memory_counters(ExplainState *es,
const MemoryContextCounters *mem_counters);
@@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
}
if (bufusage)
- show_buffer_usage(es, bufusage);
+ show_buffer_usage(es, bufusage, NULL);
if (mem_counters)
show_memory_counters(es, mem_counters);
@@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage))
{
es->indent++;
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
es->indent--;
}
}
@@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics)
BYTES_TO_KILOBYTES(metrics->bytesSent), es);
ExplainPropertyText("Format", format, es);
if (es->buffers)
- show_buffer_usage(es, &metrics->bufferUsage);
+ show_buffer_usage(es, &metrics->bufferUsage, NULL);
}
ExplainCloseGroup("Serialization", "Serialization", true, es);
@@ -1960,6 +1960,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+
+ if (es->buffers && planstate->instrument)
+ show_buffer_usage(es, &((IndexScanState *) planstate)->iss_Instrument.table_stack.bufusage, "Table");
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2278,7 +2281,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->stack.bufusage);
+ show_buffer_usage(es, &planstate->instrument->stack.bufusage,
+ IsA(plan, IndexScan) ? "Index" : NULL);
if (es->wal && planstate->instrument)
show_wal_usage(es, &planstate->instrument->stack.walusage);
@@ -2297,7 +2301,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->stack.bufusage);
+ show_buffer_usage(es, &instrument->stack.bufusage, NULL);
if (es->wal)
show_wal_usage(es, &instrument->stack.walusage);
ExplainCloseWorker(n, es);
@@ -4097,7 +4101,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage)
* Show buffer usage details. This better be sync with peek_buffer_usage.
*/
static void
-show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title)
{
if (es->format == EXPLAIN_FORMAT_TEXT)
{
@@ -4122,6 +4126,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared || has_local || has_temp)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "Buffers:");
if (has_shared)
@@ -4177,6 +4183,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
if (has_shared_timing || has_local_timing || has_temp_timing)
{
ExplainIndentText(es);
+ if (title)
+ appendStringInfo(es->str, "%s ", title);
appendStringInfoString(es->str, "I/O Timings:");
if (has_shared_timing)
@@ -4218,44 +4226,46 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage)
}
else
{
- ExplainPropertyInteger("Shared Hit Blocks", NULL,
+ char *prefix = title ? psprintf("%s ", title) : pstrdup("");
+
+ ExplainPropertyInteger(psprintf("%sShared Hit Blocks", prefix), NULL,
usage->shared_blks_hit, es);
- ExplainPropertyInteger("Shared Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Read Blocks", prefix), NULL,
usage->shared_blks_read, es);
- ExplainPropertyInteger("Shared Dirtied Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Dirtied Blocks", prefix), NULL,
usage->shared_blks_dirtied, es);
- ExplainPropertyInteger("Shared Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sShared Written Blocks", prefix), NULL,
usage->shared_blks_written, es);
- ExplainPropertyInteger("Local Hit Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Hit Blocks", prefix), NULL,
usage->local_blks_hit, es);
- ExplainPropertyInteger("Local Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Read Blocks", prefix), NULL,
usage->local_blks_read, es);
- ExplainPropertyInteger("Local Dirtied Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Dirtied Blocks", prefix), NULL,
usage->local_blks_dirtied, es);
- ExplainPropertyInteger("Local Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sLocal Written Blocks", prefix), NULL,
usage->local_blks_written, es);
- ExplainPropertyInteger("Temp Read Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sTemp Read Blocks", prefix), NULL,
usage->temp_blks_read, es);
- ExplainPropertyInteger("Temp Written Blocks", NULL,
+ ExplainPropertyInteger(psprintf("%sTemp Written Blocks", prefix), NULL,
usage->temp_blks_written, es);
if (track_io_timing)
{
- ExplainPropertyFloat("Shared I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sShared I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time),
3, es);
- ExplainPropertyFloat("Shared I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sShared I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->shared_blk_write_time),
3, es);
- ExplainPropertyFloat("Local I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sLocal I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->local_blk_read_time),
3, es);
- ExplainPropertyFloat("Local I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sLocal I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->local_blk_write_time),
3, es);
- ExplainPropertyFloat("Temp I/O Read Time", "ms",
+ ExplainPropertyFloat(psprintf("%sTemp I/O Read Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
3, es);
- ExplainPropertyFloat("Temp I/O Write Time", "ms",
+ ExplainPropertyFloat(psprintf("%sTemp I/O Write Time", prefix), "ms",
INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
3, es);
}
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d00cf820a27..f19af428d97 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -854,7 +854,20 @@ ExecAccumNodeInstrumentation_walker(PlanState *node, void *context)
planstate_tree_walker(node, ExecAccumNodeInstrumentation_walker, context);
if (node->instrument && node->instrument->stack.previous)
+ {
+ /*
+ * Index Scan nodes account for heap buffer usage separately, so we
+ * need to explitly add here
+ */
+ if (IsA(node, IndexScanState))
+ {
+ IndexScanState *iss = castNode(IndexScanState, node);
+
+ InstrStackAdd(node->instrument->stack.previous, &iss->iss_Instrument.table_stack);
+ }
+
InstrStackAdd(node->instrument->stack.previous, &node->instrument->stack);
+ }
return false;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 2a141fdae07..f6abc8d0c19 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -395,6 +395,31 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
}
+void
+InstrStartNodeStack(NodeInstrumentation * instr, InstrStack * stack)
+{
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that we always have a parent, even at the top most node */
+ Assert(pgInstrStack != NULL);
+
+ InstrPushStack(stack);
+ }
+}
+
+void
+InstrStopNodeStack(NodeInstrumentation * instr, InstrStack * stack)
+{
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that we always have a parent, even at the top most node */
+ Assert(stack->previous != NULL);
+
+ /* Adding to parent is handled by ExecAccumNodeInstrumentation */
+ InstrPopStack(stack, false);
+ }
+}
+
/* start instrumentation during parallel executor startup */
Instrumentation *
InstrStartParallelQuery(void)
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 7fcaa37fe62..fcb611acb4a 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -83,6 +83,7 @@ IndexNext(IndexScanState *node)
ExprContext *econtext;
ScanDirection direction;
IndexScanDesc scandesc;
+ ItemPointer tid;
TupleTableSlot *slot;
/*
@@ -128,8 +129,20 @@ IndexNext(IndexScanState *node)
/*
* ok, now that we have what we need, fetch the next tuple.
*/
- while (index_getnext_slot(scandesc, direction, slot))
+ while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
{
+ if (node->ss.ps.instrument)
+ InstrStartNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack);
+
+ if (unlikely(!index_fetch_heap(scandesc, slot)))
+ continue;
+
+ if (node->ss.ps.instrument)
+ InstrStopNodeStack(node->ss.ps.instrument, &node->iss_Instrument.table_stack);
+
+ if (scandesc->xs_heap_continue)
+ elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
CHECK_FOR_INTERRUPTS();
/*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9200a22bd9f..7813b4688f5 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -17,6 +17,7 @@
#include "access/htup.h"
#include "access/sdir.h"
#include "access/skey.h"
+#include "executor/instrument.h"
#include "nodes/tidbitmap.h"
#include "storage/buf.h"
#include "storage/lockdefs.h"
@@ -40,6 +41,8 @@ typedef struct IndexScanInstrumentation
{
/* Index search count (incremented with pgstat_count_index_scan call) */
uint64 nsearches;
+ /* Buffer usage of heap access during index scans */
+ InstrStack table_stack;
} IndexScanInstrumentation;
/*
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index a627cfcac2d..9bff6d8303f 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -185,6 +185,9 @@ extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
extern void InstrEndLoop(NodeInstrumentation * instr);
extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
+extern void InstrStartNodeStack(NodeInstrumentation * dst, InstrStack * stack);
+extern void InstrStopNodeStack(NodeInstrumentation * dst, InstrStack * stack);
+
extern Instrumentation *InstrStartParallelQuery(void);
extern void InstrEndParallelQuery(Instrumentation *instr, BufferUsage *bufusage, WalUsage *walusage);
extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
--
2.47.1