v3-0001-pg_stat_statements-Move-query-normalization-to-co.patch
application/octet-stream
Filename: v3-0001-pg_stat_statements-Move-query-normalization-to-co.patch
Type: application/octet-stream
Part: 0
From ffe6963fd217f64230d5d638ab9a16570a895172 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-46-230.ec2.internal>
Date: Thu, 18 Dec 2025 18:59:31 +0000
Subject: [PATCH v3] pg_stat_statements: Move query normalization to core
pg_stat_statements was modifying core generated JumbleState instances
by calling fill_in_constant_lengths() via generate_normalized_query()
to populate constant lengths. This creates problems since extensions
should not modify shared core data structures that may be used by
other extensions, and this could be considered a layering violation.
Move query normalization functions to core as a global function to
fix this. Extensions now call GenerateNormalizedQuery() instead of
directly modifying JumbleState.
This change also addresses code duplication, as several extensions
have been copying generate_normalized_query() to implement their own
normalization. The new core API provides a single, consistent
implementation for all extensions to use.
Functions are renamed to match core naming conventions and comments
are updated for clarity.
Discussion: https://postgr.es/m/CAA5RZ0tZp5qU0ikZEEqJnxvdSNGh1DWv80sb-k4QAUmiMoOp_Q@mail.gmail.com
---
.../pg_stat_statements/pg_stat_statements.c | 276 +-----------------
src/backend/nodes/queryjumblefuncs.c | 248 ++++++++++++++++
src/include/nodes/queryjumble.h | 2 +
3 files changed, 260 insertions(+), 266 deletions(-)
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 39208f80b5b..b90891859d9 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -50,7 +50,6 @@
#include "access/htup_details.h"
#include "access/parallel.h"
#include "catalog/pg_authid.h"
-#include "common/int.h"
#include "executor/instrument.h"
#include "funcapi.h"
#include "jit/jit.h"
@@ -59,7 +58,6 @@
#include "nodes/queryjumble.h"
#include "optimizer/planner.h"
#include "parser/analyze.h"
-#include "parser/scanner.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
@@ -377,11 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len,
static bool need_gc_qtexts(void);
static void gc_qtexts(void);
static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only);
-static char *generate_normalized_query(JumbleState *jstate, const char *query,
- int query_loc, int *query_len_p);
-static void fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc);
-static int comp_location(const void *a, const void *b);
/*
@@ -1359,9 +1352,16 @@ pgss_store(const char *query, int64 queryId,
if (jstate)
{
LWLockRelease(pgss->lock);
- norm_query = generate_normalized_query(jstate, query,
- query_location,
- &query_len);
+
+ /*
+ * generate the normalized query. Note that the normalized
+ * representation may well vary depending on just which
+ * "equivalent" query is used to create the hashtable entry. We
+ * assume this is OK.
+ */
+ norm_query = GenerateNormalizedQuery(jstate, query,
+ query_location,
+ &query_len);
LWLockAcquire(pgss->lock, LW_SHARED);
}
@@ -2823,259 +2823,3 @@ release_lock:
return stats_reset;
}
-
-/*
- * Generate a normalized version of the query string that will be used to
- * represent all similar queries.
- *
- * Note that the normalized representation may well vary depending on
- * just which "equivalent" query is used to create the hashtable entry.
- * We assume this is OK.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate. (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * *query_len_p contains the input string length, and is updated with
- * the result string length on exit. The resulting string might be longer
- * or shorter depending on what happens with replacement of constants.
- *
- * Returns a palloc'd string.
- */
-static char *
-generate_normalized_query(JumbleState *jstate, const char *query,
- int query_loc, int *query_len_p)
-{
- char *norm_query;
- int query_len = *query_len_p;
- int norm_query_buflen, /* Space allowed for norm_query */
- len_to_wrt, /* Length (in bytes) to write */
- quer_loc = 0, /* Source query byte location */
- n_quer_loc = 0, /* Normalized query byte location */
- last_off = 0, /* Offset from start for previous tok */
- last_tok_len = 0; /* Length (in bytes) of that tok */
- int num_constants_replaced = 0;
-
- /*
- * Get constants' lengths (core system only gives us locations). Note
- * this also ensures the items are sorted by location.
- */
- fill_in_constant_lengths(jstate, query, query_loc);
-
- /*
- * Allow for $n symbols to be longer than the constants they replace.
- * Constants must take at least one byte in text form, while a $n symbol
- * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
- * could refine that limit based on the max value of n for the current
- * query, but it hardly seems worth any extra effort to do so.
- */
- norm_query_buflen = query_len + jstate->clocations_count * 10;
-
- /* Allocate result buffer */
- norm_query = palloc(norm_query_buflen + 1);
-
- for (int i = 0; i < jstate->clocations_count; i++)
- {
- int off, /* Offset from start for cur tok */
- tok_len; /* Length (in bytes) of that tok */
-
- /*
- * If we have an external param at this location, but no lists are
- * being squashed across the query, then we skip here; this will make
- * us print the characters found in the original query that represent
- * the parameter in the next iteration (or after the loop is done),
- * which is a bit odd but seems to work okay in most cases.
- */
- if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
- continue;
-
- off = jstate->clocations[i].location;
-
- /* Adjust recorded location if we're dealing with partial string */
- off -= query_loc;
-
- tok_len = jstate->clocations[i].length;
-
- if (tok_len < 0)
- continue; /* ignore any duplicates */
-
- /* Copy next chunk (what precedes the next constant) */
- len_to_wrt = off - last_off;
- len_to_wrt -= last_tok_len;
- Assert(len_to_wrt >= 0);
- memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
- n_quer_loc += len_to_wrt;
-
- /*
- * And insert a param symbol in place of the constant token; and, if
- * we have a squashable list, insert a placeholder comment starting
- * from the list's second value.
- */
- n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
- num_constants_replaced + 1 + jstate->highest_extern_param_id,
- jstate->clocations[i].squashed ? " /*, ... */" : "");
- num_constants_replaced++;
-
- /* move forward */
- quer_loc = off + tok_len;
- last_off = off;
- last_tok_len = tok_len;
- }
-
- /*
- * We've copied up until the last ignorable constant. Copy over the
- * remaining bytes of the original query string.
- */
- len_to_wrt = query_len - quer_loc;
-
- Assert(len_to_wrt >= 0);
- memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
- n_quer_loc += len_to_wrt;
-
- Assert(n_quer_loc <= norm_query_buflen);
- norm_query[n_quer_loc] = '\0';
-
- *query_len_p = n_quer_loc;
- return norm_query;
-}
-
-/*
- * Given a valid SQL string and an array of constant-location records,
- * fill in the textual lengths of those constants.
- *
- * The constants may use any allowed constant syntax, such as float literals,
- * bit-strings, single-quoted strings and dollar-quoted strings. This is
- * accomplished by using the public API for the core scanner.
- *
- * It is the caller's job to ensure that the string is a valid SQL statement
- * with constants at the indicated locations. Since in practice the string
- * has already been parsed, and the locations that the caller provides will
- * have originated from within the authoritative parser, this should not be
- * a problem.
- *
- * Multiple constants can have the same location. We reset lengths of those
- * past the first to -1 so that they can later be ignored.
- *
- * If query_loc > 0, then "query" has been advanced by that much compared to
- * the original string start, so we need to translate the provided locations
- * to compensate. (This lets us avoid re-scanning statements before the one
- * of interest, so it's worth doing.)
- *
- * N.B. There is an assumption that a '-' character at a Const location begins
- * a negative numeric constant. This precludes there ever being another
- * reason for a constant to start with a '-'.
- */
-static void
-fill_in_constant_lengths(JumbleState *jstate, const char *query,
- int query_loc)
-{
- LocationLen *locs;
- core_yyscan_t yyscanner;
- core_yy_extra_type yyextra;
- core_YYSTYPE yylval;
- YYLTYPE yylloc;
-
- /*
- * Sort the records by location so that we can process them in order while
- * scanning the query text.
- */
- if (jstate->clocations_count > 1)
- qsort(jstate->clocations, jstate->clocations_count,
- sizeof(LocationLen), comp_location);
- locs = jstate->clocations;
-
- /* initialize the flex scanner --- should match raw_parser() */
- yyscanner = scanner_init(query,
- &yyextra,
- &ScanKeywords,
- ScanKeywordTokens);
-
- /* we don't want to re-emit any escape string warnings */
- yyextra.escape_string_warning = false;
-
- /* Search for each constant, in sequence */
- for (int i = 0; i < jstate->clocations_count; i++)
- {
- int loc;
- int tok;
-
- /* Ignore constants after the first one in the same location */
- if (i > 0 && locs[i].location == locs[i - 1].location)
- {
- locs[i].length = -1;
- continue;
- }
-
- if (locs[i].squashed)
- continue; /* squashable list, ignore */
-
- /* Adjust recorded location if we're dealing with partial string */
- loc = locs[i].location - query_loc;
- Assert(loc >= 0);
-
- /*
- * We have a valid location for a constant that's not a dupe. Lex
- * tokens until we find the desired constant.
- */
- for (;;)
- {
- tok = core_yylex(&yylval, &yylloc, yyscanner);
-
- /* We should not hit end-of-string, but if we do, behave sanely */
- if (tok == 0)
- break; /* out of inner for-loop */
-
- /*
- * We should find the token position exactly, but if we somehow
- * run past it, work with that.
- */
- if (yylloc >= loc)
- {
- if (query[loc] == '-')
- {
- /*
- * It's a negative value - this is the one and only case
- * where we replace more than a single token.
- *
- * Do not compensate for the core system's special-case
- * adjustment of location to that of the leading '-'
- * operator in the event of a negative constant. It is
- * also useful for our purposes to start from the minus
- * symbol. In this way, queries like "select * from foo
- * where bar = 1" and "select * from foo where bar = -2"
- * will have identical normalized query strings.
- */
- tok = core_yylex(&yylval, &yylloc, yyscanner);
- if (tok == 0)
- break; /* out of inner for-loop */
- }
-
- /*
- * We now rely on the assumption that flex has placed a zero
- * byte after the text of the current token in scanbuf.
- */
- locs[i].length = strlen(yyextra.scanbuf + loc);
- break; /* out of inner for-loop */
- }
- }
-
- /* If we hit end-of-string, give up, leaving remaining lengths -1 */
- if (tok == 0)
- break;
- }
-
- scanner_finish(yyscanner);
-}
-
-/*
- * comp_location: comparator for qsorting LocationLen structs by location
- */
-static int
-comp_location(const void *a, const void *b)
-{
- int l = ((const LocationLen *) a)->location;
- int r = ((const LocationLen *) b)->location;
-
- return pg_cmp_s32(l, r);
-}
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index ffc230af427..9e04dd6fbc6 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -40,10 +40,12 @@
#include "access/transam.h"
#include "catalog/pg_proc.h"
#include "common/hashfn.h"
+#include "common/int.h"
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#include "nodes/queryjumble.h"
#include "utils/lsyscache.h"
+#include "parser/scanner.h"
#include "parser/scansup.h"
#define JUMBLE_SIZE 1024 /* query serialization buffer size */
@@ -773,3 +775,249 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate,
*/
JUMBLE_STRING(aliasname);
}
+
+/*
+ * CompLocation: comparator for qsorting LocationLen structs by location
+ */
+static int
+CompLocation(const void *a, const void *b)
+{
+ int l = ((const LocationLen *) a)->location;
+ int r = ((const LocationLen *) b)->location;
+
+ return pg_cmp_s32(l, r);
+}
+
+/*
+ * Given a valid SQL string and an array of constant-location records,
+ * fill in the textual lengths of those constants in JumbleState.
+ *
+ * The constants may use any allowed constant syntax, such as float literals,
+ * bit-strings, single-quoted strings and dollar-quoted strings. This is
+ * accomplished by using the public API for the core scanner.
+ *
+ * Multiple constants can have the same location. We reset lengths of those
+ * past the first to -1 so that they can later be ignored.
+ *
+ * N.B. There is an assumption that a '-' character at a Const location begins
+ * a negative numeric constant. This precludes there ever being another
+ * reason for a constant to start with a '-'.
+ */
+static void
+SetConstantLengths(JumbleState *jstate, const char *query,
+ int query_loc)
+{
+ LocationLen *locs;
+ core_yyscan_t yyscanner;
+ core_yy_extra_type yyextra;
+ core_YYSTYPE yylval;
+ YYLTYPE yylloc;
+
+ /*
+ * Sort the records by location so that we can process them in order while
+ * scanning the query text.
+ */
+ if (jstate->clocations_count > 1)
+ qsort(jstate->clocations, jstate->clocations_count,
+ sizeof(LocationLen), CompLocation);
+ locs = jstate->clocations;
+
+ /* initialize the flex scanner --- should match raw_parser() */
+ yyscanner = scanner_init(query,
+ &yyextra,
+ &ScanKeywords,
+ ScanKeywordTokens);
+
+ /* we don't want to re-emit any escape string warnings */
+ yyextra.escape_string_warning = false;
+
+ /* Search for each constant, in sequence */
+ for (int i = 0; i < jstate->clocations_count; i++)
+ {
+ int loc;
+ int tok;
+
+ /* Ignore constants after the first one in the same location */
+ if (i > 0 && locs[i].location == locs[i - 1].location)
+ {
+ locs[i].length = -1;
+ continue;
+ }
+
+ if (locs[i].squashed)
+ continue; /* squashable list, ignore */
+
+ /*
+ * Adjust the constant's location using the provided starting location
+ * of the current statement. This allows us to avoid scanning a
+ * multi-statement string from the beginning.
+ */
+ loc = locs[i].location - query_loc;
+ Assert(loc >= 0);
+
+ /*
+ * We have a valid location for a constant that's not a dupe. Lex
+ * tokens until we find the desired constant.
+ */
+ for (;;)
+ {
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+
+ /* We should not hit end-of-string, but if we do, behave sanely */
+ if (tok == 0)
+ break; /* out of inner for-loop */
+
+ /*
+ * We should find the token position exactly, but if we somehow
+ * run past it, work with that.
+ */
+ if (yylloc >= loc)
+ {
+ if (query[loc] == '-')
+ {
+ /*
+ * It's a negative value - this is the one and only case
+ * where we replace more than a single token.
+ *
+ * Do not compensate for the special-case adjustment of
+ * location to that of the leading '-' operator in the
+ * event of a negative constant (see doNegate() in
+ * gram.y). It is also useful for our purposes to start
+ * from the minus symbol. In this way, queries like
+ * "select * from foo where bar = 1" and "select * from
+ * foo where bar = -2" will have identical normalized
+ * query strings.
+ */
+ tok = core_yylex(&yylval, &yylloc, yyscanner);
+ if (tok == 0)
+ break; /* out of inner for-loop */
+ }
+
+ /*
+ * We now rely on the assumption that flex has placed a zero
+ * byte after the text of the current token in scanbuf.
+ */
+ locs[i].length = strlen(yyextra.scanbuf + loc);
+ break; /* out of inner for-loop */
+ }
+ }
+
+ /* If we hit end-of-string, give up, leaving remaining lengths -1 */
+ if (tok == 0)
+ break;
+ }
+
+ scanner_finish(yyscanner);
+}
+
+/*
+ * Callback to generate a normalized version of the query string that will be used to
+ * represent all similar queries.
+ *
+ * It is the caller's job to ensure that the string is a valid SQL statement
+ * with the correct constant locations in jstate->clocations. Since in
+ * practice the string has already been parsed using the authoritative parser
+ * and the locations are set by core query jumbling, this should not be a problem.
+ *
+ * *query_len_p contains the input string length, and is updated with
+ * the result string length on exit. The resulting string might be longer
+ * or shorter depending on what happens with replacement of constants.
+ *
+ * Returns a palloc'd string.
+ */
+char *
+GenerateNormalizedQuery(JumbleState *jstate, const char *query,
+ int query_loc, int *query_len_p)
+{
+ char *norm_query;
+ int query_len = *query_len_p;
+ int norm_query_buflen, /* Space allowed for norm_query */
+ len_to_wrt, /* Length (in bytes) to write */
+ quer_loc = 0, /* Source query byte location */
+ n_quer_loc = 0, /* Normalized query byte location */
+ last_off = 0, /* Offset from start for previous tok */
+ last_tok_len = 0; /* Length (in bytes) of that tok */
+ int num_constants_replaced = 0;
+
+ /*
+ * Set constants' lengths in JumbleState, as only locations are set during
+ * DoJumble(). Note this also ensures the items are sorted by location.
+ */
+ SetConstantLengths((JumbleState *) jstate, query, query_loc);
+
+ /*
+ * Allow for $n symbols to be longer than the constants they replace.
+ * Constants must take at least one byte in text form, while a $n symbol
+ * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We
+ * could refine that limit based on the max value of n for the current
+ * query, but it hardly seems worth any extra effort to do so.
+ */
+ norm_query_buflen = query_len + jstate->clocations_count * 10;
+
+ /* Allocate result buffer */
+ norm_query = palloc(norm_query_buflen + 1);
+
+ for (int i = 0; i < jstate->clocations_count; i++)
+ {
+ int off, /* Offset from start for cur tok */
+ tok_len; /* Length (in bytes) of that tok */
+
+ /*
+ * If we have an external param at this location, but no lists are
+ * being squashed across the query, then we skip here; this will make
+ * us print the characters found in the original query that represent
+ * the parameter in the next iteration (or after the loop is done),
+ * which is a bit odd but seems to work okay in most cases.
+ */
+ if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists)
+ continue;
+
+ off = jstate->clocations[i].location;
+
+ /* Adjust the constant's location (see SetConstantLengths) */
+ off -= query_loc;
+
+ tok_len = jstate->clocations[i].length;
+
+ if (tok_len < 0)
+ continue; /* ignore any duplicates */
+
+ /* Copy next chunk (what precedes the next constant) */
+ len_to_wrt = off - last_off;
+ len_to_wrt -= last_tok_len;
+ Assert(len_to_wrt >= 0);
+ memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+ n_quer_loc += len_to_wrt;
+
+ /*
+ * And insert a param symbol in place of the constant token; and, if
+ * we have a squashable list, insert a placeholder comment starting
+ * from the list's second value.
+ */
+ n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s",
+ num_constants_replaced + 1 + jstate->highest_extern_param_id,
+ jstate->clocations[i].squashed ? " /*, ... */" : "");
+ num_constants_replaced++;
+
+ /* move forward */
+ quer_loc = off + tok_len;
+ last_off = off;
+ last_tok_len = tok_len;
+ }
+
+ /*
+ * We've copied up until the last ignorable constant. Copy over the
+ * remaining bytes of the original query string.
+ */
+ len_to_wrt = query_len - quer_loc;
+
+ Assert(len_to_wrt >= 0);
+ memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
+ n_quer_loc += len_to_wrt;
+
+ Assert(n_quer_loc <= norm_query_buflen);
+ norm_query[n_quer_loc] = '\0';
+
+ *query_len_p = n_quer_loc;
+ return norm_query;
+}
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index dcb36dcb44f..cb8741b1d05 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -91,6 +91,8 @@ extern PGDLLIMPORT int compute_query_id;
extern const char *CleanQuerytext(const char *query, int *location, int *len);
+extern char *GenerateNormalizedQuery(JumbleState *jstate, const char *query,
+ int query_loc, int *query_len_p);
extern JumbleState *JumbleQuery(Query *query);
extern void EnableQueryId(void);
--
2.43.0