From c28a9686478a2a28f23ac0e9ce809b905374a97a Mon Sep 17 00:00:00 2001 From: Corey Huinker Date: Tue, 4 Nov 2025 15:35:58 -0500 Subject: [PATCH v5 2/7] Add working input function for pg_ndistinct. This will consume the format that was established when the output function for pg_ndistinct was recently changed. This will be needed for importing extended statistics. --- src/backend/statistics/mvdistinct.c | 445 +++++++++++++++++++++++- src/test/regress/expected/stats_ext.out | 22 ++ src/test/regress/sql/stats_ext.sql | 12 + 3 files changed, 472 insertions(+), 7 deletions(-) diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c index ca60841b813..5b7b3aa26a4 100644 --- a/src/backend/statistics/mvdistinct.c +++ b/src/backend/statistics/mvdistinct.c @@ -27,9 +27,15 @@ #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" +#include "common/int.h" +#include "common/jsonapi.h" #include "lib/stringinfo.h" +#include "mb/pg_wchar.h" +#include "nodes/miscnodes.h" +#include "nodes/pg_list.h" #include "statistics/extended_stats_internal.h" #include "statistics/statistics.h" +#include "utils/builtins.h" #include "utils/fmgrprotos.h" #include "utils/syscache.h" #include "utils/typcache.h" @@ -328,28 +334,453 @@ statext_ndistinct_deserialize(bytea *data) return ndistinct; } +typedef enum +{ + NDIST_EXPECT_START = 0, + NDIST_EXPECT_ITEM, + NDIST_EXPECT_KEY, + NDIST_EXPECT_ATTNUM_LIST, + NDIST_EXPECT_ATTNUM, + NDIST_EXPECT_NDISTINCT, + NDIST_EXPECT_COMPLETE +} ndistinctSemanticState; + +typedef struct +{ + const char *str; + ndistinctSemanticState state; + + List *distinct_items; /* Accumulated complete MVNDistinctItems */ + Node *escontext; + + bool found_attributes; /* Item has an attributes key */ + bool found_ndistinct; /* Item has ndistinct key */ + List *attnum_list; /* Accumulated attributes attnums */ + int64 ndistinct; +} ndistinctParseState; + +/* + * Invoked at the start of each MVNDistinctItem. + * + * The entire JSON document shoul be one array of MVNDistinctItem objects. + * + * If we're anywhere else in the document, it's an error. + */ +static JsonParseErrorType +ndistinct_object_start(void *state) +{ + ndistinctParseState *parse = state; + + if (parse->state != NDIST_EXPECT_ITEM) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Expected Item object"))); + return JSON_SEM_ACTION_FAILED; + } + + /* Now we expect to see attributes/ndistinct keys */ + parse->state = NDIST_EXPECT_KEY; + return JSON_SUCCESS; +} + +/* + * Routine to allow qsorting of AttNumbers + */ +static int +attnum_compare(const void *aptr, const void *bptr) +{ + AttrNumber a = *(const AttrNumber *) aptr; + AttrNumber b = *(const AttrNumber *) bptr; + + return pg_cmp_s16(a, b); +} + + +/* + * Invoked at the end of an object. + * + * Check to ensure that it was a complete MVNDistinctItem + * + */ +static JsonParseErrorType +ndistinct_object_end(void *state) +{ + ndistinctParseState *parse = state; + + int natts = 0; + AttrNumber *attrsort; + + MVNDistinctItem *item; + + if (!parse->found_attributes) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Item must contain \"attributes\" key"))); + return JSON_SEM_ACTION_FAILED; + } + + if (!parse->found_ndistinct) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Item must contain \"ndistinct\" key"))); + return JSON_SEM_ACTION_FAILED; + } + + if (parse->attnum_list == NIL) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("The \"attributes\" key must be an non-empty array"))); + return JSON_SEM_ACTION_FAILED; + } + + /* + * We need at least 2 attnums for a ndistinct item, anything less is + * malformed. + */ + natts = parse->attnum_list->length; + if (natts < 2) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("The attributes key must contain an array of at least two attnums"))); + + return JSON_SEM_ACTION_FAILED; + } + attrsort = palloc0(natts * sizeof(AttrNumber)); + + /* Create the MVNDistinctItem */ + item = palloc(sizeof(MVNDistinctItem)); + item->nattributes = natts; + item->attributes = palloc0(natts * sizeof(AttrNumber)); + item->ndistinct = (double) parse->ndistinct; + + /* fill out both attnum list and sortable list */ + for (int i = 0; i < natts; i++) + { + attrsort[i] = (AttrNumber) parse->attnum_list->elements[i].int_value; + item->attributes[i] = attrsort[i]; + } + + /* Check attrsort for uniqueness */ + qsort(attrsort, natts, sizeof(AttrNumber), attnum_compare); + for (int i = 1; i < natts; i++) + if (attrsort[i] == attrsort[i - 1]) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("attnum list duplicate value found: %d", attrsort[i]))); + + return JSON_SEM_ACTION_FAILED; + } + pfree(attrsort); + + parse->distinct_items = lappend(parse->distinct_items, (void *) item); + + /* reset item state vars */ + list_free(parse->attnum_list); + parse->attnum_list = NIL; + parse->ndistinct = 0; + parse->found_attributes = false; + parse->found_ndistinct = false; + + /* Now we are looking for the next MVNDistinctItem */ + parse->state = NDIST_EXPECT_ITEM; + return JSON_SUCCESS; +} + + +/* + * ndsitinct input format has two types of arrays, the outer MVNDistinctItem + * array, and the attnum list array within each MVNDistinctItem. + */ +static JsonParseErrorType +ndistinct_array_start(void *state) +{ + ndistinctParseState *parse = state; + + switch (parse->state) + { + case NDIST_EXPECT_ATTNUM_LIST: + parse->state = NDIST_EXPECT_ATTNUM; + break; + + case NDIST_EXPECT_START: + parse->state = NDIST_EXPECT_ITEM; + break; + + default: + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Array found in unexpected place"))); + return JSON_SEM_ACTION_FAILED; + } + + return JSON_SUCCESS; +} + + +static JsonParseErrorType +ndistinct_array_end(void *state) +{ + ndistinctParseState *parse = state; + + /* The attnum list is complete, look for more MVNDistinctItem keys */ + if (parse->state == NDIST_EXPECT_ATTNUM) + { + parse->state = NDIST_EXPECT_KEY; + return JSON_SUCCESS; + } + + if (parse->state == NDIST_EXPECT_ITEM) + { + parse->state = NDIST_EXPECT_COMPLETE; + return JSON_SUCCESS; + } + + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Array found in unexpected place"))); + return JSON_SEM_ACTION_FAILED; +} + + +/* + * The valid keys for the MVNDistinctItem object are: + * - attributes + * - ndistinct + */ +static JsonParseErrorType +ndistinct_object_field_start(void *state, char *fname, bool isnull) +{ + ndistinctParseState *parse = state; + + const char *attributes = "attributes"; + const char *ndistinct = "ndistinct"; + + if (strcmp(fname, attributes) == 0) + { + parse->found_attributes = true; + parse->state = NDIST_EXPECT_ATTNUM_LIST; + return JSON_SUCCESS; + } + + if (strcmp(fname, ndistinct) == 0) + { + parse->found_ndistinct = true; + parse->state = NDIST_EXPECT_NDISTINCT; + return JSON_SUCCESS; + } + + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Only allowed keys are \%s\" and \%s\".", attributes, ndistinct))); + return JSON_SEM_ACTION_FAILED; +} + +/* + * + */ +static JsonParseErrorType +ndistinct_array_element_start(void *state, bool isnull) +{ + ndistinctParseState *parse = state; + + if (parse->state == NDIST_EXPECT_ATTNUM) + { + if (isnull) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Attnum list elements cannot be null."))); + + return JSON_SEM_ACTION_FAILED; + } + return JSON_SUCCESS; + } + + if (parse->state == NDIST_EXPECT_ITEM) + { + if (isnull) + { + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Item list elements cannot be null."))); + + return JSON_SEM_ACTION_FAILED; + } + + return JSON_SUCCESS; + } + + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Unexpected array element."))); + + return JSON_SEM_ACTION_FAILED; +} + +/* + * Handle scalar events from the ndistinct input parser. + * + */ +static JsonParseErrorType +ndistinct_scalar(void *state, char *token, JsonTokenType tokentype) +{ + ndistinctParseState *parse = state; + + if (parse->state == NDIST_EXPECT_ATTNUM) + { + AttrNumber attnum = pg_strtoint16_safe(token, parse->escontext); + + if (SOFT_ERROR_OCCURRED(parse->escontext)) + return JSON_SEM_ACTION_FAILED; + + parse->attnum_list = lappend_int(parse->attnum_list, (int) attnum); + return JSON_SUCCESS; + } + + if (parse->state == NDIST_EXPECT_NDISTINCT) + { + /* + * While the structure dictates that ndistinct in a double precision + * floating point, in practice it has always been an integer, and it + * is output as such. Therefore, we follow usage precendent over the + * actual storage structure, and read it in as an integer. + */ + parse->ndistinct = pg_strtoint64_safe(token, parse->escontext); + + if (SOFT_ERROR_OCCURRED(parse->escontext)) + return JSON_SEM_ACTION_FAILED; + + parse->state = NDIST_EXPECT_KEY; + return JSON_SUCCESS; + } + + ereturn(parse->escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", parse->str), + errdetail("Unexpected scalar."))); + + return JSON_SEM_ACTION_FAILED; +} + /* * pg_ndistinct_in * input routine for type pg_ndistinct * - * pg_ndistinct is real enough to be a table column, but it has no - * operations of its own, and disallows input (just like pg_node_tree). + * example input: + * [{"attributes": [6, -1], "ndistinct": 14}, + * {"attributes": [6, -2], "ndistinct": 9143}, + * {"attributes": [-1,-2], "ndistinct": 13454}, + * {"attributes": [6, -1, -2], "ndistinct": 14549}] */ Datum pg_ndistinct_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ndistinct"))); + char *str = PG_GETARG_CSTRING(0); - PG_RETURN_VOID(); /* keep compiler quiet */ + ndistinctParseState parse_state; + JsonParseErrorType result; + JsonLexContext *lex; + JsonSemAction sem_action; + + /* initialize semantic state */ + parse_state.str = str; + parse_state.state = NDIST_EXPECT_START; + parse_state.distinct_items = NIL; + parse_state.escontext = fcinfo->context; + parse_state.found_attributes = false; + parse_state.found_ndistinct = false; + parse_state.attnum_list = NIL; + parse_state.ndistinct = 0; + + /* set callbacks */ + sem_action.semstate = (void *) &parse_state; + sem_action.object_start = ndistinct_object_start; + sem_action.object_end = ndistinct_object_end; + sem_action.array_start = ndistinct_array_start; + sem_action.array_end = ndistinct_array_end; + sem_action.object_field_start = ndistinct_object_field_start; + sem_action.object_field_end = NULL; + sem_action.array_element_start = ndistinct_array_element_start; + sem_action.array_element_end = NULL; + sem_action.scalar = ndistinct_scalar; + + lex = makeJsonLexContextCstringLen(NULL, str, strlen(str), + PG_UTF8, true); + result = pg_parse_json(lex, &sem_action); + freeJsonLexContext(lex); + + if (result == JSON_SUCCESS) + { + MVNDistinct *ndistinct; + int nitems = parse_state.distinct_items->length; + bytea *bytes; + + ndistinct = palloc(offsetof(MVNDistinct, items) + + nitems * sizeof(MVNDistinctItem)); + + ndistinct->magic = STATS_NDISTINCT_MAGIC; + ndistinct->type = STATS_NDISTINCT_TYPE_BASIC; + ndistinct->nitems = nitems; + + for (int i = 0; i < nitems; i++) + { + MVNDistinctItem *item = parse_state.distinct_items->elements[i].ptr_value; + + ndistinct->items[i].ndistinct = item->ndistinct; + ndistinct->items[i].nattributes = item->nattributes; + ndistinct->items[i].attributes = item->attributes; + + /* + * free the MVNDistinctItem, but not the attributes we're still + * using + */ + pfree(item); + } + bytes = statext_ndistinct_serialize(ndistinct); + + list_free(parse_state.distinct_items); + for (int i = 0; i < nitems; i++) + pfree(ndistinct->items[i].attributes); + pfree(ndistinct); + + PG_RETURN_BYTEA_P(bytes); + } + else if (result == JSON_SEM_ACTION_FAILED) + PG_RETURN_NULL(); /* escontext already set */ + + /* Anything else is a generic JSON parse error */ + ereturn(parse_state.escontext, (Datum) 0, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed pg_ndistinct: \"%s\"", str), + errdetail("Must be valid JSON."))); + PG_RETURN_NULL(); } /* * pg_ndistinct * output routine for type pg_ndistinct * - * Produces a human-readable representation of the value. + * Produces a human-readable representation of the value, in the format: + * [{"attributes": [attnum,. ..], "ndistinct": int}, ...] + * */ Datum pg_ndistinct_out(PG_FUNCTION_ARGS) diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 2dc771369e5..1d837badb96 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -3689,6 +3689,28 @@ SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE extstat_small(y)'); 196 | 196 (1 row) +-- Test input function of pg_ndistinct. +SELECT '[{"attributes" : [2,3], "ndistinct" : 4}, + {"attributes" : [2,-1], "ndistinct" : 4}, + {"attributes" : [2,3,-1], "ndistinct" : 4}, + {"attributes" : [1,3,-1,-2], "ndistinct" : 4}]'::pg_ndistinct; + pg_ndistinct +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + [{"attributes": [2, 3], "ndistinct": 4}, {"attributes": [2, -1], "ndistinct": 4}, {"attributes": [2, 3, -1], "ndistinct": 4}, {"attributes": [1, 3, -1, -2], "ndistinct": 4}] +(1 row) + +-- error, cannot duplicate attribute +SELECT '[{"attributes" : [2,3], "ndistinct" : 4}, + {"attributes" : [2,-1], "ndistinct" : 4}, + {"attributes" : [2,3,2], "ndistinct" : 4}, + {"attributes" : [1,3,-1,-2], "ndistinct" : 4}]'::pg_ndistinct; +ERROR: malformed pg_ndistinct: "[{"attributes" : [2,3], "ndistinct" : 4}, + {"attributes" : [2,-1], "ndistinct" : 4}, + {"attributes" : [2,3,2], "ndistinct" : 4}, + {"attributes" : [1,3,-1,-2], "ndistinct" : 4}]" +LINE 1: SELECT '[{"attributes" : [2,3], "ndistinct" : 4}, + ^ +DETAIL: attnum list duplicate value found: 2 -- Tidy up DROP TABLE sb_1, sb_2 CASCADE; DROP FUNCTION extstat_small(x numeric); diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 207c431e68c..2c306dcc971 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -1831,6 +1831,18 @@ ANALYZE sb_2; SELECT * FROM check_estimated_rows('SELECT * FROM sb_2 WHERE extstat_small(y)'); +-- Test input function of pg_ndistinct. +SELECT '[{"attributes" : [2,3], "ndistinct" : 4}, + {"attributes" : [2,-1], "ndistinct" : 4}, + {"attributes" : [2,3,-1], "ndistinct" : 4}, + {"attributes" : [1,3,-1,-2], "ndistinct" : 4}]'::pg_ndistinct; + +-- error, cannot duplicate attribute +SELECT '[{"attributes" : [2,3], "ndistinct" : 4}, + {"attributes" : [2,-1], "ndistinct" : 4}, + {"attributes" : [2,3,2], "ndistinct" : 4}, + {"attributes" : [1,3,-1,-2], "ndistinct" : 4}]'::pg_ndistinct; + -- Tidy up DROP TABLE sb_1, sb_2 CASCADE; DROP FUNCTION extstat_small(x numeric); -- 2.51.1