v3-0002-pg_stat_statements-move-out-jumble-specific-routi.patch
application/octet-stream
Filename: v3-0002-pg_stat_statements-move-out-jumble-specific-routi.patch
Type: application/octet-stream
Part: 1
Message:
Re: Bug in pg_stat_statements
Patch
Same data as JSON:
GET /api/v1/attachments/:id/patch
the parsed metadata as JSON — format, series position, per-file stats; never the diff bytes.
API reference →
Format: format-patch
Series: patch v3-0002
Subject: pg_stat_statements: move out jumble-specific routines
| File | + | − |
|---|---|---|
| contrib/pg_stat_statements/pg_stat_statements.c | 7 | 267 |
| src/backend/nodes/queryjumblefuncs.c | 258 | 0 |
| src/include/nodes/queryjumble.h | 4 | 0 |
From 110bf28c56218846d8a75180e868f97608b0cf80 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-46-230.ec2.internal>
Date: Fri, 24 Oct 2025 20:42:30 +0000
Subject: [PATCH v3 2/2] pg_stat_statements: move out jumble-specific routines
During discussion of commit 8eddb06, it was noted that the
fill_in_constant_lengths and generate_normalized_query routines
depend on intricate jumbling knowledge. Implementing this code
within pg_stat_statements does not make sense; it belongs in the
core jumbling module, queryjumblefuncs.c. pg_stat_statements can
then consume these routines without needing to know the details.
This separation became especially apparent with the squashing of
constant lists, but it has been a general issue all along.
In addition to reducing the pg_stat_statements codebase, this
change makes these routines available to other extensions
performing similar operations.
---
.../pg_stat_statements/pg_stat_statements.c | 274 +-----------------
src/backend/nodes/queryjumblefuncs.c | 258 +++++++++++++++++
src/include/nodes/queryjumble.h | 4 +
3 files changed, 269 insertions(+), 267 deletions(-)
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 6c9e49b471b..95293d3b594 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,7 +50,6 @@
#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/pg_authid.h"
-#include "common/int.h"
#include "executor/instrument.h"
#include "funcapi.h"
#include "jit/jit.h"
@@ -59,7 +58,6 @@
#include "nodes/queryjumble.h"
#include "optimizer/planner.h"
#include "parser/analyze.h"
-#include "parser/scanner.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
@@ -377,12 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len,
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
-static char *generate_normalized_query(JumbleState *jstate, const char *query,
- int query_loc, int *query_len_p);
-static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc);
-static int comp_location(const void *a, const void *b);
-
/*
* Module load callback
@@ -1359,6 +1351,13 @@ pgss_store(const char *query, int64 queryId,
if (jstate)
{
LWLockRelease(pgss->lock);
+
+ /*
+ * generate the normalized query. Note that the normalized
+ * representation may well vary depending on just which
+ * "equivalent" query is used to create the hashtable entry. We
+ * assume this is OK.
+ */
norm_query = generate_normalized_query(jstate, query,
query_location,
&query_len);
@@ -2823,262 +2822,3 @@ release_lock:
return stats_reset;
}
-
-/*
- * Generate a normalized version of the query string that will be used to
- * represent all similar queries.
- *
- * Note that the normalized representation may well vary depending on
- * just which "equivalent" query is used to create the hashtable entry.
- * We assume this is OK.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate. (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * *query_len_p contains the input string length, and is updated with
- * the result string length on exit. The resulting string might be longer
- * or shorter depending on what happens with replacement of constants.
- *
- * Returns a palloc'd string.
- */
-static char *
-generate_normalized_query(JumbleState *jstate, const char *query,
- int query_loc, int *query_len_p)
-{
- char *norm_query;
- int query_len = *query_len_p;
- int norm_query_buflen, /* Space allowed for norm_query */
- len_to_wrt, /* Length (in bytes) to write */
- quer_loc = 0, /* Source query byte location */
- n_quer_loc = 0, /* Normalized query byte location */
- last_off = 0, /* Offset from start for previous tok */
- last_tok_len = 0; /* Length (in bytes) of that tok */
- int num_constants_replaced = 0;
-
- /*
- * Get constants' lengths (core system only gives us locations). Note
- * this also ensures the items are sorted by location.
- */
- fill_in_constant_lengths(jstate, query, query_loc);
-
- /*
- * Allow for $n symbols to be longer than the constants they replace.
- * Constants must take at least one byte in text form, while a $n symbol
- * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
- * could refine that limit based on the max value of n for the current
- * query, but it hardly seems worth any extra effort to do so.
- */
- norm_query_buflen = query_len + jstate->clocations_count * 10;
-
- /* Allocate result buffer */
- norm_query = palloc(norm_query_buflen + 1);
-
- for (int i = 0; i < jstate->clocations_count; i++)
- {
- int off, /* Offset from start for cur tok */
- tok_len; /* Length (in bytes) of that tok */
-
- /*
- * If we have an external param at this location, but no lists are
- * being squashed across the query, then we skip here; this will make
- * us print the characters found in the original query that represent
- * the parameter in the next iteration (or after the loop is done),
- * which is a bit odd but seems to work okay in most cases.
- */
- if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
- continue;
-
- off = jstate->clocations[i].location;
-
- /* Adjust recorded location if we're dealing with partial string */
- off -= query_loc;
-
- tok_len = jstate->clocations[i].length;
-
- if (tok_len < 0)
- continue; /* ignore any duplicates */
-
- /* Copy next chunk (what precedes the next constant) */
- len_to_wrt = off - last_off;
- len_to_wrt -= last_tok_len;
- Assert(len_to_wrt >= 0);
- memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
- n_quer_loc += len_to_wrt;
-
- /*
- * And insert a param symbol in place of the constant token; and, if
- * we have a squashable list, insert a placeholder comment starting
- * from the list's second value.
- */
- n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
- num_constants_replaced + 1 + jstate->highest_extern_param_id,
- jstate->clocations[i].squashed ? " /*, ... */" : "");
- num_constants_replaced++;
-
- /* move forward */
- quer_loc = off + tok_len;
- last_off = off;
- last_tok_len = tok_len;
- }
-
- /*
- * We've copied up until the last ignorable constant. Copy over the
- * remaining bytes of the original query string.
- */
- len_to_wrt = query_len - quer_loc;
-
- Assert(len_to_wrt >= 0);
- memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
- n_quer_loc += len_to_wrt;
-
- Assert(n_quer_loc <= norm_query_buflen);
- norm_query[n_quer_loc] = '\0';
-
- *query_len_p = n_quer_loc;
- return norm_query;
-}
-
-/*
- * Given a valid SQL string and an array of constant-location records,
- * fill in the textual lengths of those constants.
- *
- * The constants may use any allowed constant syntax, such as float literals,
- * bit-strings, single-quoted strings and dollar-quoted strings. This is
- * accomplished by using the public API for the core scanner.
- *
- * It is the caller's job to ensure that the string is a valid SQL statement
- * with constants at the indicated locations. Since in practice the string
- * has already been parsed, and the locations that the caller provides will
- * have originated from within the authoritative parser, this should not be
- * a problem.
- *
- * Duplicate constant pointers are possible, and will have their lengths
- * marked as '-1', so that they are later ignored.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate. (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * N.B. There is an assumption that a '-' character at a Const location begins
- * a negative numeric constant. This precludes there ever being another
- * reason for a constant to start with a '-'.
- */
-static void
-fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc)
-{
- LocationLen *locs;
- core_yyscan_t yyscanner;
- core_yy_extra_type yyextra;
- core_YYSTYPE yylval;
- YYLTYPE yylloc;
- int last_loc = -1;
- int i;
-
- /*
- * Sort the records by location so that we can process them in order while
- * scanning the query text.
- */
- if (jstate->clocations_count > 1)
- qsort(jstate->clocations, jstate->clocations_count,
- sizeof(LocationLen), comp_location);
- locs = jstate->clocations;
-
- /* initialize the flex scanner --- should match raw_parser() */
- yyscanner = scanner_init(query,
- &yyextra,
- &ScanKeywords,
- ScanKeywordTokens);
-
- /* we don't want to re-emit any escape string warnings */
- yyextra.escape_string_warning = false;
-
- /* Search for each constant, in sequence */
- for (i = 0; i < jstate->clocations_count; i++)
- {
- int loc = locs[i].location;
- int tok;
- bool squashed = locs[i].squashed;
-
- /* Adjust recorded location if we're dealing with partial string */
- loc -= query_loc;
-
- Assert(loc >= 0);
-
- if (loc <= last_loc)
- {
- locs[i].length = -1;
- continue; /* Duplicate constant, ignore */
- }
-
- /* We have the next valid location, save the current location */
- last_loc = loc;
-
- if (squashed)
- continue; /* squashable list, ignore */
-
- /* Lex tokens until we find the desired constant */
- for (;;)
- {
- tok = core_yylex(&yylval, &yylloc, yyscanner);
-
- /* We should not hit end-of-string, but if we do, behave sanely */
- if (tok == 0)
- break; /* out of inner for-loop */
-
- /*
- * We should find the token position exactly, but if we somehow
- * run past it, work with that.
- */
- if (yylloc >= loc)
- {
- if (query[loc] == '-')
- {
- /*
- * It's a negative value - this is the one and only case
- * where we replace more than a single token.
- *
- * Do not compensate for the core system's special-case
- * adjustment of location to that of the leading '-'
- * operator in the event of a negative constant. It is
- * also useful for our purposes to start from the minus
- * symbol. In this way, queries like "select * from foo
- * where bar = 1" and "select * from foo where bar = -2"
- * will have identical normalized query strings.
- */
- tok = core_yylex(&yylval, &yylloc, yyscanner);
- if (tok == 0)
- break; /* out of inner for-loop */
- }
-
- /*
- * We now rely on the assumption that flex has placed a zero
- * byte after the text of the current token in scanbuf.
- */
- locs[i].length = strlen(yyextra.scanbuf + loc);
- break; /* out of inner for-loop */
- }
- }
-
- /* If we hit end-of-string, give up, leaving remaining lengths -1 */
- if (tok == 0)
- break;
- }
-
- scanner_finish(yyscanner);
-}
-
-/*
- * comp_location: comparator for qsorting LocationLen structs by location
- */
-static int
-comp_location(const void *a, const void *b)
-{
- int l = ((const LocationLen *) a)->location;
- int r = ((const LocationLen *) b)->location;
-
- return pg_cmp_s32(l, r);
-}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 31f97151977..9244e1468f3 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -40,10 +40,12 @@
#include "access/transam.h"
#include "catalog/pg_proc.h"
#include "common/hashfn.h"
+#include "common/int.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "nodes/queryjumble.h"
#include "utils/lsyscache.h"
+#include "parser/scanner.h"
#include "parser/scansup.h"
#define JUMBLE_SIZE 1024 /* query serialization buffer size */
@@ -77,6 +79,262 @@ static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node);
static void _jumbleRangeTblEntry_eref(JumbleState *jstate,
RangeTblEntry *rte,
Alias *expr);
+static int comp_location(const void *a, const void *b);
+
+/*
+ * comp_location: comparator for qsorting LocationLen structs by location
+ */
+static int
+comp_location(const void *a, const void *b)
+{
+ int l = ((const LocationLen *) a)->location;
+ int r = ((const LocationLen *) b)->location;
+
+ return pg_cmp_s32(l, r);
+}
+
+ /*
+ * Generate a normalized version of the query string that will be used to
+ * represent all similar queries.
+ *
+ * If query_loc > 0, then "query" has been advanced by that much compared to
+ * the original string start, so we need to translate the provided locations
+ * to compensate. (This lets us avoid re-scanning statements before the one
+ * of interest, so it's worth doing.)
+ *
+ * *query_len_p contains the input string length, and is updated with the
+ * result string length on exit. The resulting string might be longer or
+ * shorter depending on what happens with replacement of constants.
+ *
+ * Returns a palloc'd string.
+ */
+char *
+generate_normalized_query(JumbleState *jstate, const char *query,
+ int query_loc, int *query_len_p)
+{
+ char *norm_query;
+ int query_len = *query_len_p;
+ int norm_query_buflen, /* Space allowed for norm_query */
+ len_to_wrt, /* Length (in bytes) to write */
+ quer_loc = 0, /* Source query byte location */
+ n_quer_loc = 0, /* Normalized query byte location */
+ last_off = 0, /* Offset from start for previous tok */
+ last_tok_len = 0; /* Length (in bytes) of that tok */
+ int num_constants_replaced = 0;
+
+ /*
+ * Get constants' lengths (core system only gives us locations). Note
+ * this also ensures the items are sorted by location.
+ */
+ fill_in_constant_lengths(jstate, query, query_loc);
+
+ /*
+ * Allow for $n symbols to be longer than the constants they replace.
+ * Constants must take at least one byte in text form, while a $n symbol
+ * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
+ * could refine that limit based on the max value of n for the current
+ * query, but it hardly seems worth any extra effort to do so.
+ */
+ norm_query_buflen = query_len + jstate->clocations_count * 10;
+
+ /* Allocate result buffer */
+ norm_query = palloc(norm_query_buflen + 1);
+
+ for (int i = 0; i < jstate->clocations_count; i++)
+ {
+ int off, /* Offset from start for cur tok */
+ tok_len; /* Length (in bytes) of that tok */
+
+ /*
+ * If we have an external param at this location, but no lists are
+ * being squashed across the query, then we skip here; this will make
+ * us print the characters found in the original query that represent
+ * the parameter in the next iteration (or after the loop is done),
+ * which is a bit odd but seems to work okay in most cases.
+ */
+ if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
+ continue;
+
+ off = jstate->clocations[i].location;
+
+ /* Adjust recorded location if we're dealing with partial string */
+ off -= query_loc;
+
+ tok_len = jstate->clocations[i].length;
+
+ if (tok_len < 0)
+ continue; /* ignore any duplicates */
+
+ /* Copy next chunk (what precedes the next constant) */
+ len_to_wrt = off - last_off;
+ len_to_wrt -= last_tok_len;
+ Assert(len_to_wrt >= 0);
+ memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+ n_quer_loc += len_to_wrt;
+
+ /*
+ * And insert a param symbol in place of the constant token; and, if
+ * we have a squashable list, insert a placeholder comment starting
+ * from the list's second value.
+ */
+ n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
+ num_constants_replaced + 1 + jstate->highest_extern_param_id,
+ jstate->clocations[i].squashed ? " /*, ... */" : "");
+ num_constants_replaced++;
+
+ /* move forward */
+ quer_loc = off + tok_len;
+ last_off = off;
+ last_tok_len = tok_len;
+ }
+
+ /*
+ * We've copied up until the last ignorable constant. Copy over the
+ * remaining bytes of the original query string.
+ */
+ len_to_wrt = query_len - quer_loc;
+
+ Assert(len_to_wrt >= 0);
+ memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+ n_quer_loc += len_to_wrt;
+
+ Assert(n_quer_loc <= norm_query_buflen);
+ norm_query[n_quer_loc] = '\0';
+
+ *query_len_p = n_quer_loc;
+ return norm_query;
+}
+
+/*
+ * Given a valid SQL string and an array of constant-location records,
+ * fill in the textual lengths of those constants.
+ *
+ * The constants may use any allowed constant syntax, such as float literals,
+ * bit-strings, single-quoted strings and dollar-quoted strings. This is
+ * accomplished by using the public API for the core scanner.
+ *
+ * It is the caller's job to ensure that the string is a valid SQL statement
+ * with constants at the indicated locations. Since in practice the string
+ * has already been parsed, and the locations that the caller provides will
+ * have originated from within the authoritative parser, this should not be
+ * a problem.
+ *
+ * Duplicate constant pointers are possible, and will have their lengths
+ * marked as '-1', so that they are later ignored.
+ *
+ * If query_loc > 0, then "query" has been advanced by that much compared to
+ * the original string start, so we need to translate the provided locations
+ * to compensate. (This lets us avoid re-scanning statements before the one
+ * of interest, so it's worth doing.)
+ *
+ * N.B. There is an assumption that a '-' character at a Const location begins
+ * a negative numeric constant. This precludes there ever being another
+ * reason for a constant to start with a '-'.
+ */
+void
+fill_in_constant_lengths(JumbleState *jstate, const char *query,
+ int query_loc)
+{
+ LocationLen *locs;
+ core_yyscan_t yyscanner;
+ core_yy_extra_type yyextra;
+ core_YYSTYPE yylval;
+ YYLTYPE yylloc;
+ int last_loc = -1;
+ int i;
+
+ /*
+ * Sort the records by location so that we can process them in order while
+ * scanning the query text.
+ */
+ if (jstate->clocations_count > 1)
+ qsort(jstate->clocations, jstate->clocations_count,
+ sizeof(LocationLen), comp_location);
+ locs = jstate->clocations;
+
+ /* initialize the flex scanner --- should match raw_parser() */
+ yyscanner = scanner_init(query,
+ &yyextra,
+ &ScanKeywords,
+ ScanKeywordTokens);
+
+ /* we don't want to re-emit any escape string warnings */
+ yyextra.escape_string_warning = false;
+
+ /* Search for each constant, in sequence */
+ for (i = 0; i < jstate->clocations_count; i++)
+ {
+ int loc = locs[i].location;
+ int tok;
+ bool squashed = locs[i].squashed;
+
+ /* Adjust recorded location if we're dealing with partial string */
+ loc -= query_loc;
+
+ Assert(loc >= 0);
+
+ if (loc <= last_loc)
+ {
+ locs[i].length = -1;
+ continue; /* Duplicate constant, ignore */
+ }
+
+ /* We have the next valid location, save the current location */
+ last_loc = loc;
+
+ if (squashed)
+ continue; /* squashable list, ignore */
+
+ /* Lex tokens until we find the desired constant */
+ for (;;)
+ {
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+ /* We should not hit end-of-string, but if we do, behave sanely */
+ if (tok == 0)
+ break; /* out of inner for-loop */
+
+ /*
+ * We should find the token position exactly, but if we somehow
+ * run past it, work with that.
+ */
+ if (yylloc >= loc)
+ {
+ if (query[loc] == '-')
+ {
+ /*
+ * It's a negative value - this is the one and only case
+ * where we replace more than a single token.
+ *
+ * Do not compensate for the core system's special-case
+ * adjustment of location to that of the leading '-'
+ * operator in the event of a negative constant. It is
+ * also useful for our purposes to start from the minus
+ * symbol. In this way, queries like "select * from foo
+ * where bar = 1" and "select * from foo where bar = -2"
+ * will have identical normalized query strings.
+ */
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+ if (tok == 0)
+ break; /* out of inner for-loop */
+ }
+
+ /*
+ * We now rely on the assumption that flex has placed a zero
+ * byte after the text of the current token in scanbuf.
+ */
+ locs[i].length = strlen(yyextra.scanbuf + loc);
+ break; /* out of inner for-loop */
+ }
+ }
+
+ /* If we hit end-of-string, give up, leaving remaining lengths -1 */
+ if (tok == 0)
+ break;
+ }
+
+ scanner_finish(yyscanner);
+}
/*
* Given a possibly multi-statement source string, confine our attention to the
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index dcb36dcb44f..8024a0883f7 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -91,6 +91,10 @@ extern PGDLLIMPORT int compute_query_id;
extern const char *CleanQuerytext(const char *query, int *location, int *len);
+extern char *generate_normalized_query(JumbleState *jstate, const char *query,
+ int query_loc, int *query_len_p);
+extern void fill_in_constant_lengths(JumbleState *jstate, const char *query,
+ int query_loc);
extern JumbleState *JumbleQuery(Query *query);
extern void EnableQueryId(void);
--
2.43.0