From 4eb9fef9a0e83d6a66d9253387c311c07bac995d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 Dec 2025 18:59:31 +0000 Subject: [PATCH v1 1/2] pg_stat_statements: Move jumbling related functions A normalized query string generated by generate_normalized_query can be used by other extensions, as all will use the same query string and constant locations generated by core query jumbling. There is nothing specific to pg_stat_statements about this function, so it should be moved to core as a global function. This also moves the related helper functions fill_in_constant_length and comp_location. Discussion: https://postgr.es/m/aQA9v9nLu5qsX8IE%40paquier.xyz --- .../pg_stat_statements/pg_stat_statements.c | 270 +----------------- src/backend/nodes/queryjumblefuncs.c | 252 ++++++++++++++++ src/include/nodes/queryjumble.h | 2 + 3 files changed, 261 insertions(+), 263 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 39208f80b5b..3eb29dfbd47 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -50,7 +50,6 @@ #include "access/htup_details.h" #include "access/parallel.h" #include "catalog/pg_authid.h" -#include "common/int.h" #include "executor/instrument.h" #include "funcapi.h" #include "jit/jit.h" @@ -59,7 +58,6 @@ #include "nodes/queryjumble.h" #include "optimizer/planner.h" #include "parser/analyze.h" -#include "parser/scanner.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -377,11 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len, static bool need_gc_qtexts(void); static void gc_qtexts(void); static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only); -static char *generate_normalized_query(JumbleState *jstate, const char *query, - int query_loc, int *query_len_p); -static void fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc); -static int comp_location(const void *a, const void *b); /* @@ -1359,6 +1352,13 @@ pgss_store(const char *query, int64 queryId, if (jstate) { LWLockRelease(pgss->lock); + + /* + * generate the normalized query. Note that the normalized + * representation may well vary depending on just which + * "equivalent" query is used to create the hashtable entry. We + * assume this is OK. + */ norm_query = generate_normalized_query(jstate, query, query_location, &query_len); @@ -2823,259 +2823,3 @@ release_lock: return stats_reset; } - -/* - * Generate a normalized version of the query string that will be used to - * represent all similar queries. - * - * Note that the normalized representation may well vary depending on - * just which "equivalent" query is used to create the hashtable entry. - * We assume this is OK. - * - * If query_loc > 0, then "query" has been advanced by that much compared to - * the original string start, so we need to translate the provided locations - * to compensate. (This lets us avoid re-scanning statements before the one - * of interest, so it's worth doing.) - * - * *query_len_p contains the input string length, and is updated with - * the result string length on exit. The resulting string might be longer - * or shorter depending on what happens with replacement of constants. - * - * Returns a palloc'd string. - */ -static char * -generate_normalized_query(JumbleState *jstate, const char *query, - int query_loc, int *query_len_p) -{ - char *norm_query; - int query_len = *query_len_p; - int norm_query_buflen, /* Space allowed for norm_query */ - len_to_wrt, /* Length (in bytes) to write */ - quer_loc = 0, /* Source query byte location */ - n_quer_loc = 0, /* Normalized query byte location */ - last_off = 0, /* Offset from start for previous tok */ - last_tok_len = 0; /* Length (in bytes) of that tok */ - int num_constants_replaced = 0; - - /* - * Get constants' lengths (core system only gives us locations). Note - * this also ensures the items are sorted by location. - */ - fill_in_constant_lengths(jstate, query, query_loc); - - /* - * Allow for $n symbols to be longer than the constants they replace. - * Constants must take at least one byte in text form, while a $n symbol - * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We - * could refine that limit based on the max value of n for the current - * query, but it hardly seems worth any extra effort to do so. - */ - norm_query_buflen = query_len + jstate->clocations_count * 10; - - /* Allocate result buffer */ - norm_query = palloc(norm_query_buflen + 1); - - for (int i = 0; i < jstate->clocations_count; i++) - { - int off, /* Offset from start for cur tok */ - tok_len; /* Length (in bytes) of that tok */ - - /* - * If we have an external param at this location, but no lists are - * being squashed across the query, then we skip here; this will make - * us print the characters found in the original query that represent - * the parameter in the next iteration (or after the loop is done), - * which is a bit odd but seems to work okay in most cases. - */ - if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) - continue; - - off = jstate->clocations[i].location; - - /* Adjust recorded location if we're dealing with partial string */ - off -= query_loc; - - tok_len = jstate->clocations[i].length; - - if (tok_len < 0) - continue; /* ignore any duplicates */ - - /* Copy next chunk (what precedes the next constant) */ - len_to_wrt = off - last_off; - len_to_wrt -= last_tok_len; - Assert(len_to_wrt >= 0); - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - /* - * And insert a param symbol in place of the constant token; and, if - * we have a squashable list, insert a placeholder comment starting - * from the list's second value. - */ - n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", - num_constants_replaced + 1 + jstate->highest_extern_param_id, - jstate->clocations[i].squashed ? " /*, ... */" : ""); - num_constants_replaced++; - - /* move forward */ - quer_loc = off + tok_len; - last_off = off; - last_tok_len = tok_len; - } - - /* - * We've copied up until the last ignorable constant. Copy over the - * remaining bytes of the original query string. - */ - len_to_wrt = query_len - quer_loc; - - Assert(len_to_wrt >= 0); - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - Assert(n_quer_loc <= norm_query_buflen); - norm_query[n_quer_loc] = '\0'; - - *query_len_p = n_quer_loc; - return norm_query; -} - -/* - * Given a valid SQL string and an array of constant-location records, - * fill in the textual lengths of those constants. - * - * The constants may use any allowed constant syntax, such as float literals, - * bit-strings, single-quoted strings and dollar-quoted strings. This is - * accomplished by using the public API for the core scanner. - * - * It is the caller's job to ensure that the string is a valid SQL statement - * with constants at the indicated locations. Since in practice the string - * has already been parsed, and the locations that the caller provides will - * have originated from within the authoritative parser, this should not be - * a problem. - * - * Multiple constants can have the same location. We reset lengths of those - * past the first to -1 so that they can later be ignored. - * - * If query_loc > 0, then "query" has been advanced by that much compared to - * the original string start, so we need to translate the provided locations - * to compensate. (This lets us avoid re-scanning statements before the one - * of interest, so it's worth doing.) - * - * N.B. There is an assumption that a '-' character at a Const location begins - * a negative numeric constant. This precludes there ever being another - * reason for a constant to start with a '-'. - */ -static void -fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc) -{ - LocationLen *locs; - core_yyscan_t yyscanner; - core_yy_extra_type yyextra; - core_YYSTYPE yylval; - YYLTYPE yylloc; - - /* - * Sort the records by location so that we can process them in order while - * scanning the query text. - */ - if (jstate->clocations_count > 1) - qsort(jstate->clocations, jstate->clocations_count, - sizeof(LocationLen), comp_location); - locs = jstate->clocations; - - /* initialize the flex scanner --- should match raw_parser() */ - yyscanner = scanner_init(query, - &yyextra, - &ScanKeywords, - ScanKeywordTokens); - - /* we don't want to re-emit any escape string warnings */ - yyextra.escape_string_warning = false; - - /* Search for each constant, in sequence */ - for (int i = 0; i < jstate->clocations_count; i++) - { - int loc; - int tok; - - /* Ignore constants after the first one in the same location */ - if (i > 0 && locs[i].location == locs[i - 1].location) - { - locs[i].length = -1; - continue; - } - - if (locs[i].squashed) - continue; /* squashable list, ignore */ - - /* Adjust recorded location if we're dealing with partial string */ - loc = locs[i].location - query_loc; - Assert(loc >= 0); - - /* - * We have a valid location for a constant that's not a dupe. Lex - * tokens until we find the desired constant. - */ - for (;;) - { - tok = core_yylex(&yylval, &yylloc, yyscanner); - - /* We should not hit end-of-string, but if we do, behave sanely */ - if (tok == 0) - break; /* out of inner for-loop */ - - /* - * We should find the token position exactly, but if we somehow - * run past it, work with that. - */ - if (yylloc >= loc) - { - if (query[loc] == '-') - { - /* - * It's a negative value - this is the one and only case - * where we replace more than a single token. - * - * Do not compensate for the core system's special-case - * adjustment of location to that of the leading '-' - * operator in the event of a negative constant. It is - * also useful for our purposes to start from the minus - * symbol. In this way, queries like "select * from foo - * where bar = 1" and "select * from foo where bar = -2" - * will have identical normalized query strings. - */ - tok = core_yylex(&yylval, &yylloc, yyscanner); - if (tok == 0) - break; /* out of inner for-loop */ - } - - /* - * We now rely on the assumption that flex has placed a zero - * byte after the text of the current token in scanbuf. - */ - locs[i].length = strlen(yyextra.scanbuf + loc); - break; /* out of inner for-loop */ - } - } - - /* If we hit end-of-string, give up, leaving remaining lengths -1 */ - if (tok == 0) - break; - } - - scanner_finish(yyscanner); -} - -/* - * comp_location: comparator for qsorting LocationLen structs by location - */ -static int -comp_location(const void *a, const void *b) -{ - int l = ((const LocationLen *) a)->location; - int r = ((const LocationLen *) b)->location; - - return pg_cmp_s32(l, r); -} diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index ffc230af427..07511496779 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -40,10 +40,12 @@ #include "access/transam.h" #include "catalog/pg_proc.h" #include "common/hashfn.h" +#include "common/int.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "nodes/queryjumble.h" #include "utils/lsyscache.h" +#include "parser/scanner.h" #include "parser/scansup.h" #define JUMBLE_SIZE 1024 /* query serialization buffer size */ @@ -633,6 +635,18 @@ _jumbleList(JumbleState *jstate, Node *node) } } +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) a)->location; + int r = ((const LocationLen *) b)->location; + + return pg_cmp_s32(l, r); +} + /* * We try to jumble lists of expressions as one individual item regardless * of how many elements are in the list. This is know as squashing, which @@ -773,3 +787,241 @@ _jumbleRangeTblEntry_eref(JumbleState *jstate, */ JUMBLE_STRING(aliasname); } + +/* + * Given a valid SQL string and an array of constant-location records, + * fill in the textual lengths of those constants. + * + * The constants may use any allowed constant syntax, such as float literals, + * bit-strings, single-quoted strings and dollar-quoted strings. This is + * accomplished by using the public API for the core scanner. + * + * It is the caller's job to ensure that the string is a valid SQL statement + * with constants at the indicated locations. Since in practice the string + * has already been parsed, and the locations that the caller provides will + * have originated from within the authoritative parser, this should not be + * a problem. + * + * Multiple constants can have the same location. We reset lengths of those + * past the first to -1 so that they can later be ignored. + * + * N.B. There is an assumption that a '-' character at a Const location begins + * a negative numeric constant. This precludes there ever being another + * reason for a constant to start with a '-'. + */ +static void +fill_in_constant_lengths(const JumbleState *jstate, const char *query, + int query_loc) +{ + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + + /* + * Sort the records by location so that we can process them in order while + * scanning the query text. + */ + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + locs = jstate->clocations; + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + /* we don't want to re-emit any escape string warnings */ + yyextra.escape_string_warning = false; + + /* Search for each constant, in sequence */ + for (int i = 0; i < jstate->clocations_count; i++) + { + int loc; + int tok; + + /* Ignore constants after the first one in the same location */ + if (i > 0 && locs[i].location == locs[i - 1].location) + { + locs[i].length = -1; + continue; + } + + if (locs[i].squashed) + continue; /* squashable list, ignore */ + + /* + * Adjust the constant's location using the provided starting location + * of the current statement. This allows us to avoid scanning a + * multi-statement string from the beginning. + */ + loc = locs[i].location - query_loc; + Assert(loc >= 0); + + /* + * We have a valid location for a constant that's not a dupe. Lex + * tokens until we find the desired constant. + */ + for (;;) + { + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* We should not hit end-of-string, but if we do, behave sanely */ + if (tok == 0) + break; /* out of inner for-loop */ + + /* + * We should find the token position exactly, but if we somehow + * run past it, work with that. + */ + if (yylloc >= loc) + { + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case + * where we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' + * operator in the event of a negative constant. It is + * also useful for our purposes to start from the minus + * symbol. In this way, queries like "select * from foo + * where bar = 1" and "select * from foo where bar = -2" + * will have identical normalized query strings. + */ + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + + /* + * We now rely on the assumption that flex has placed a zero + * byte after the text of the current token in scanbuf. + */ + locs[i].length = strlen(yyextra.scanbuf + loc); + break; /* out of inner for-loop */ + } + } + + /* If we hit end-of-string, give up, leaving remaining lengths -1 */ + if (tok == 0) + break; + } + + scanner_finish(yyscanner); +} + +/* + * Generate a normalized version of the query string that will be used to + * represent all similar queries. + * + * Note that the normalized representation may well vary depending on + * just which "equivalent" query is used to create the hashtable entry. + * We assume this is OK. + * + * *query_len_p contains the input string length, and is updated with + * the result string length on exit. The resulting string might be longer + * or shorter depending on what happens with replacement of constants. + * + * Returns a palloc'd string that the caller must pfree. + */ +char * +generate_normalized_query(const JumbleState *jstate, const char *query, + int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + len_to_wrt, /* Length (in bytes) to write */ + quer_loc = 0, /* Source query byte location */ + n_quer_loc = 0, /* Normalized query byte location */ + last_off = 0, /* Offset from start for previous tok */ + last_tok_len = 0; /* Length (in bytes) of that tok */ + int num_constants_replaced = 0; + + /* + * Get constants' lengths (core system only gives us locations). Note + * this also ensures the items are sorted by location. + */ + fill_in_constant_lengths(jstate, query, query_loc); + + /* + * Allow for $n symbols to be longer than the constants they replace. + * Constants must take at least one byte in text form, while a $n symbol + * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We + * could refine that limit based on the max value of n for the current + * query, but it hardly seems worth any extra effort to do so. + */ + norm_query_buflen = query_len + jstate->clocations_count * 10; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + for (int i = 0; i < jstate->clocations_count; i++) + { + int off, /* Offset from start for cur tok */ + tok_len; /* Length (in bytes) of that tok */ + + /* + * If we have an external param at this location, but no lists are + * being squashed across the query, then we skip here; this will make + * us print the characters found in the original query that represent + * the parameter in the next iteration (or after the loop is done), + * which is a bit odd but seems to work okay in most cases. + */ + if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) + continue; + + off = jstate->clocations[i].location; + + /* Adjust the constant's location (see fill_in_constant_lengths) */ + off -= query_loc; + + tok_len = jstate->clocations[i].length; + + if (tok_len < 0) + continue; /* ignore any duplicates */ + + /* Copy next chunk (what precedes the next constant) */ + len_to_wrt = off - last_off; + len_to_wrt -= last_tok_len; + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + /* + * And insert a param symbol in place of the constant token; and, if + * we have a squashable list, insert a placeholder comment starting + * from the list's second value. + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", + num_constants_replaced + 1 + jstate->highest_extern_param_id, + jstate->clocations[i].squashed ? " /*, ... */" : ""); + num_constants_replaced++; + + /* move forward */ + quer_loc = off + tok_len; + last_off = off; + last_tok_len = tok_len; + } + + /* + * We've copied up until the last ignorable constant. Copy over the + * remaining bytes of the original query string. + */ + len_to_wrt = query_len - quer_loc; + + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + Assert(n_quer_loc <= norm_query_buflen); + norm_query[n_quer_loc] = '\0'; + + *query_len_p = n_quer_loc; + return norm_query; +} diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h index dcb36dcb44f..8207227e79e 100644 --- a/src/include/nodes/queryjumble.h +++ b/src/include/nodes/queryjumble.h @@ -91,6 +91,8 @@ extern PGDLLIMPORT int compute_query_id; extern const char *CleanQuerytext(const char *query, int *location, int *len); +extern char *generate_normalized_query(JumbleState *jstate, const char *query, + int query_loc, int *query_len_p); extern JumbleState *JumbleQuery(Query *query); extern void EnableQueryId(void); -- 2.43.0