v1-0003-Introduce-stack-for-tracking-per-node-WAL-buffer-.patch
application/octet-stream
Filename: v1-0003-Introduce-stack-for-tracking-per-node-WAL-buffer-.patch
Type: application/octet-stream
Part: 2
From 4375fcb4141f18d6cd927659970518553aa3fe94 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Sun, 31 Aug 2025 16:37:05 -0700
Subject: [PATCH v1 3/3] Introduce stack for tracking per-node WAL/buffer usage
---
src/backend/commands/explain.c | 8 +-
src/backend/executor/execMain.c | 7 ++
src/backend/executor/execProcnode.c | 9 +++
src/backend/executor/instrument.c | 111 ++++++++++++++++++++++++----
src/include/executor/instrument.h | 42 ++++++++++-
5 files changed, 155 insertions(+), 22 deletions(-)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 6a135e51996..584f0adbcc1 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2280,9 +2280,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->stack.bufusage);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->stack.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
@@ -2299,9 +2299,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->stack.bufusage);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->stack.walusage);
ExplainCloseWorker(n, es);
}
}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b83ced9a57a..1c2268bc608 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -312,6 +312,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
DestReceiver *dest;
bool sendTuples;
MemoryContext oldcontext;
+ InstrStackResource *res;
/* sanity checks */
Assert(queryDesc != NULL);
@@ -333,6 +334,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (queryDesc->totaltime)
InstrStart(queryDesc->totaltime);
+ /* Start up per-query node level instrumentation */
+ res = InstrStartQuery();
+
/*
* extract information from the query descriptor and the query feature.
*/
@@ -382,6 +386,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (sendTuples)
dest->rShutdown(dest);
+ /* Shut down per-query node level instrumentation */
+ InstrShutdownQuery(res);
+
if (queryDesc->totaltime)
InstrStop(queryDesc->totaltime, estate->es_processed);
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d286471254b..7436f307994 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -823,8 +823,17 @@ ExecShutdownNode_walker(PlanState *node, void *context)
/* Stop the node if we started it above, reporting 0 tuples. */
if (node->instrument && node->instrument->running)
+ {
InstrStopNode(node->instrument, 0);
+ /*
+ * Propagate WAL/buffer stats to the parent node on the
+ * instrumentation stack (which is where InstrStopNode returned us
+ * to).
+ */
+ InstrNodeAddToCurrent(&node->instrument->stack);
+ }
+
return false;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index c53480d8030..040d1fdecbd 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,15 +16,40 @@
#include <unistd.h>
#include "executor/instrument.h"
+#include "utils/memutils.h"
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
static WalUsage save_pgWalUsage;
+InstrStack *pgInstrStack = NULL;
static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
+/*
+ * Node-specific instrumentation handling uses ResourceOwner mechanism to
+ * reset pgInstrStack on abort.
+ *
+ * Each scope opened by InstrStartQuery is registered under the descriptor
+ * below; InstrShutdownQuery unregisters it again on normal completion. If
+ * the owner releases a scope that is still registered, the callback named
+ * in the descriptor, ResOwnerReleaseInstrStack, is what resets the stack.
+ */
+static void ResOwnerReleaseInstrStack(Datum res);
+static const ResourceOwnerDesc instr_stack_resowner_desc =
+{
+ .name = "instrumentation stack scope",
+ .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
+ .release_priority = RELEASE_PRIO_FIRST,
+ .ReleaseResource = ResOwnerReleaseInstrStack, /* sets pgInstrStack to NULL */
+ .DebugPrint = NULL, /* default message is fine */
+};
+/*
+ * Register an instrumentation stack scope with "owner" under
+ * instr_stack_resowner_desc.
+ */
+static inline void
+ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStackResource *scope)
+{
+	Datum		value = PointerGetDatum(scope);
+
+	ResourceOwnerRemember(owner, value, &instr_stack_resowner_desc);
+}
+/*
+ * Unregister an instrumentation stack scope that was previously registered
+ * with "owner" under instr_stack_resowner_desc.
+ */
+static inline void
+ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStackResource *scope)
+{
+	Datum		value = PointerGetDatum(scope);
+
+	ResourceOwnerForget(owner, value, &instr_stack_resowner_desc);
+}
/* General purpose instrumentation handling */
Instrumentation *
@@ -139,12 +164,17 @@ InstrStartNode(NodeInstrumentation * instr)
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
elog(ERROR, "InstrStartNode called twice in a row");
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /*
+ * Ensure that we have an active pgInstrStack (InstrStartQuery must
+ * have been called)
+ */
+ Assert(pgInstrStack != NULL);
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ instr->stack.previous = pgInstrStack;
+ pgInstrStack = &instr->stack;
+ }
}
/* Exit from a plan node */
@@ -169,14 +199,12 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples)
INSTR_TIME_SET_ZERO(instr->starttime);
}
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
-
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that there is a stack entry above the top-most node */
+ Assert(instr->stack.previous != NULL);
+ pgInstrStack = instr->stack.previous;
+ }
/* Is this the first tuple of this cycle? */
if (!instr->running)
@@ -257,10 +285,65 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
/* Add delta of buffer usage since entry to node's totals */
if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage);
if (dst->need_walusage)
+ WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
+}
+
+/*
+ * InstrStartQuery
+ *		Open a per-query scope on the instrumentation stack.
+ *
+ * Pushes a zeroed InstrStack entry onto pgInstrStack and returns a tracking
+ * struct that has to be handed to InstrShutdownQuery once the query is done.
+ * The struct is registered with CurrentResourceOwner, so that
+ * ResOwnerReleaseInstrStack gets to reset pgInstrStack if the owner releases
+ * it before InstrShutdownQuery has run.
+ *
+ * Both the stack entry and the tracking struct are allocated in
+ * CurTransactionContext.
+ */
+InstrStackResource *
+InstrStartQuery(void)
+{
+	InstrStack *usage = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack));
+	InstrStackResource *usageRes = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStackResource));
+	ResourceOwner owner = CurrentResourceOwner;
+
+	Assert(owner != NULL);
+
+	/*
+	 * Remember the entry that is on top right now.  InstrShutdownQuery
+	 * restores pgInstrStack from usageRes->previous; if the field were left
+	 * zeroed, shutdown would reset the stack to NULL even when an enclosing
+	 * entry was active at the time this scope was opened.
+	 */
+	usageRes->previous = pgInstrStack;
+	usageRes->owner = owner;
+
+	ResourceOwnerEnlarge(owner);
+	ResourceOwnerRememberInstrStack(owner, usageRes);
+
+	/* Make the fresh entry the new top of the stack */
+	usage->previous = pgInstrStack;
+	pgInstrStack = usage;
+
+	return usageRes;
+}
+
+/*
+ * InstrShutdownQuery
+ *		Close the instrumentation scope opened by InstrStartQuery.
+ *
+ * Sets pgInstrStack to the entry recorded in res->previous, unregisters res
+ * from its resource owner (the release callback is not wanted once the query
+ * has finished normally) and frees res.  The caller must not touch res
+ * afterwards.
+ *
+ * Only the tracking struct is freed here: the InstrStack entry pushed by
+ * InstrStartQuery cannot be reached through res, so it stays allocated until
+ * its memory context is reset.
+ */
+void
+InstrShutdownQuery(InstrStackResource *res)
+{
+	Assert(res != NULL);
+	Assert(res->owner != NULL);
+
+	pgInstrStack = res->previous;
+
+	ResourceOwnerForgetInstrStack(res->owner, res);
+
+	/*
+	 * Nothing refers to the tracking struct any more.  Release it right away
+	 * instead of leaving one allocation behind in CurTransactionContext for
+	 * every executor run of the transaction.
+	 */
+	pfree(res);
+}
+
+static void
+ResOwnerReleaseInstrStack(Datum res)
+{
+ /*
+ * Release callback of instr_stack_resowner_desc. The Datum identifying
+ * the scope being released is not looked at; the whole stack is dropped.
+ *
+ * XXX: Registered resources are *not* called in reverse order, i.e. we'll
+ * get what was first registered first at shutdown. To avoid handling
+ * that, we are resetting the stack here on abort (instead of recovering
+ * to previous).
+ */
+ pgInstrStack = NULL;
+}
+/*
+ * InstrNodeAddToCurrent
+ * Add the buffer and WAL usage collected in "add" to the entry that is
+ * currently on top of the instrumentation stack.
+ *
+ * Does nothing when pgInstrStack is NULL.
+ */
+void
+InstrNodeAddToCurrent(InstrStack * add)
+{
+ if (pgInstrStack != NULL)
+ {
+ InstrStack *dst = pgInstrStack;
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
WalUsageAdd(&dst->walusage, &add->walusage);
+ }
}
/* note current values during parallel executor startup */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 3a280f4caae..a98efab5f93 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -14,6 +14,7 @@
#define INSTRUMENT_H
#include "portability/instr_time.h"
+#include "utils/resowner.h"
/*
@@ -66,6 +67,21 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/*
+ * Stack of WAL/buffer usage used for per-node instrumentation.
+ *
+ * pgInstrStack points at the top entry. The INSTR_BUFUSAGE_* and
+ * INSTR_WALUSAGE_* macros add to that entry's counters in addition to the
+ * global pgBufferUsage / pgWalUsage totals.
+ */
+typedef struct InstrStack
+{
+ struct InstrStack *previous; /* entry below this one, or NULL */
+ BufferUsage bufusage; /* buffer usage charged to this entry */
+ WalUsage walusage; /* WAL usage charged to this entry */
+} InstrStack;
+
+/*
+ * Used to manage resetting of instrumentation stack on abort.
+ *
+ * Returned by InstrStartQuery and handed back to InstrShutdownQuery.
+ */
+typedef struct InstrStackResource
+{
+ InstrStack *previous; /* InstrShutdownQuery sets pgInstrStack to this */
+ ResourceOwner owner; /* owner this struct is registered with */
+} InstrStackResource;
+
/*
* General purpose instrumentation that can capture time, WAL/buffer usage and tuples
*
@@ -91,6 +107,10 @@ typedef struct Instrumentation
/*
* Specialized instrumentation for per-node execution statistics
+ *
+ * Requires use of InstrStartQuery to initialize the stack used for WAL/buffer
+ * usage statistics, and cleanup through InstrShutdownQuery. Solely intended for
+ * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE).
*/
typedef struct NodeInstrumentation
{
@@ -105,8 +125,6 @@ typedef struct NodeInstrumentation
instr_time counter; /* accumulated runtime for this node */
double firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
double startup; /* total startup time (in seconds) */
double total; /* total time (in seconds) */
@@ -115,8 +133,7 @@ typedef struct NodeInstrumentation
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
+ InstrStack stack; /* stack tracking buffer/WAL usage */
} NodeInstrumentation;
typedef struct WorkerInstrumentation
@@ -127,6 +144,7 @@ typedef struct WorkerInstrumentation
extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
+extern PGDLLIMPORT InstrStack * pgInstrStack;
extern Instrumentation *InstrAlloc(int n, int instrument_options);
extern void InstrStart(Instrumentation *instr);
@@ -135,11 +153,14 @@ extern void InstrStop(Instrumentation *instr, double nTuples);
extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options,
bool async_mode);
extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options);
+extern InstrStackResource * InstrStartQuery(void);
+extern void InstrShutdownQuery(InstrStackResource * res);
extern void InstrStartNode(NodeInstrumentation * instr);
extern void InstrStopNode(NodeInstrumentation * instr, double nTuples);
extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
extern void InstrEndLoop(NodeInstrumentation * instr);
extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
+extern void InstrNodeAddToCurrent(InstrStack * stack);
extern void InstrStartParallelQuery(void);
extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
@@ -151,21 +172,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
#define INSTR_BUFUSAGE_INCR(fld) do { \
pgBufferUsage.fld++; \
+ if (pgInstrStack) /* also charge the entry on top of the stack */ \
+ pgInstrStack->bufusage.fld++; \
} while(0)
#define INSTR_BUFUSAGE_ADD(fld,val) do { \
pgBufferUsage.fld += val; \
+ if (pgInstrStack) /* note: val is evaluated a second time below */ \
+ pgInstrStack->bufusage.fld += val; \
} while(0)
#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
INSTR_TIME_ADD(pgBufferUsage.fld, val); \
+ if (pgInstrStack) \
+ INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \
} while (0)
#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \
+ if (pgInstrStack) \
+ INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \
} while (0)
+/* WAL counterparts: update pgWalUsage and, if set, the top stack entry */
#define INSTR_WALUSAGE_INCR(fld) do { \
pgWalUsage.fld++; \
+ if (pgInstrStack) \
+ pgInstrStack->walusage.fld++; \
} while(0)
#define INSTR_WALUSAGE_ADD(fld,val) do { \
pgWalUsage.fld += val; \
+ if (pgInstrStack) /* note: val is evaluated a second time below */ \
+ pgInstrStack->walusage.fld += val; \
} while(0)
#endif /* INSTRUMENT_H */
--
2.47.1