From f05d3af46c096aaa02271ff9915a9e4a35a72e60 Mon Sep 17 00:00:00 2001 From: Vasuki M Date: Mon, 2 Mar 2026 17:59:16 +0530 Subject: [PATCH v6] ANALYZE: add MISSING_STATS_ONLY option Introduce a new ANALYZE (MISSING_STATS_ONLY) option that causes ANALYZE to process only relations that are missing column or extended statistics. When this option is specified, ANALYZE skips columns that already have entries in pg_statistic and processes only attributes for which statistics are absent. If all columns and extended statistics objects already have statistics, the relation is skipped entirely. This allows repeated manual ANALYZE runs to avoid unnecessary work while still analyzing: - newly created tables, - tables that have gained data but lack statistics, - relations where new columns were added, - relations with missing extended statistics. The default ANALYZE behavior remains unchanged. Regression tests are included to cover: - empty tables, - tables gaining data, - repeated runs, - explicit column lists, - expression indexes, - declarative partitioning, - table inheritance. This patch refines earlier versions by moving the missing-statistics logic into examine_attribute() to ensure that column-level decisions are handled consistently within the existing analysis flow and ensuring correct handling of inheritance and partitioned tables. --- src/backend/commands/analyze.c | 134 +++++++++++++++++- src/backend/commands/vacuum.c | 6 +- src/include/commands/vacuum.h | 1 + .../expected/analyze_missing_stats_only.out | 97 +++++++++++++ src/test/regress/parallel_schedule | 1 + .../sql/analyze_missing_stats_only.sql | 109 ++++++++++++++ 6 files changed, 342 insertions(+), 6 deletions(-) create mode 100644 src/test/regress/expected/analyze_missing_stats_only.out create mode 100644 src/test/regress/sql/analyze_missing_stats_only.sql diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 53adac91..928ede4f 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -29,6 +29,8 @@ #include "catalog/index.h" #include "catalog/indexing.h" #include "catalog/pg_inherits.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_statistic_ext_data.h" #include "commands/progress.h" #include "commands/tablecmds.h" #include "commands/vacuum.h" @@ -46,6 +48,7 @@ #include "storage/procarray.h" #include "utils/attoptcache.h" #include "utils/datum.h" +#include "utils/fmgroids.h" #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -83,7 +86,7 @@ static void compute_index_stats(Relation onerel, double totalrows, HeapTuple *rows, int numrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, - Node *index_expr); + Node *index_expr,bool missing_stats_only); static int acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows, int targrows, double *totalrows, double *totaldeadrows); @@ -96,6 +99,54 @@ static void update_attstats(Oid relid, bool inh, static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static bool +relation_has_missing_extended_stats(Relation rel) +{ + Relation extrel; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tup; + + extrel = table_open(StatisticExtRelationId, AccessShareLock); + + ScanKeyInit(&key, + Anum_pg_statistic_ext_stxrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(rel))); + + scan = systable_beginscan(extrel, + StatisticExtRelidIndexId, + true, + NULL, + 1, + &key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_statistic_ext e = + (Form_pg_statistic_ext) GETSTRUCT(tup); + + HeapTuple dtup = + SearchSysCache2(STATEXTDATASTXOID, + ObjectIdGetDatum(e->oid), + BoolGetDatum(false)); + + if (!HeapTupleIsValid(dtup)) + { + systable_endscan(scan); + table_close(extrel, AccessShareLock); + return true; + } + + ReleaseSysCache(dtup); + } + + systable_endscan(scan); + table_close(extrel, AccessShareLock); + + return false; +} /* * analyze_rel() -- analyze one relation @@ -399,7 +450,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, col, RelationGetRelationName(onerel)))); unique_cols = bms_add_member(unique_cols, i); - vacattrstats[tcnt] = examine_attribute(onerel, i, NULL); + vacattrstats[tcnt] = examine_attribute(onerel, i, NULL,(params.options & VACOPT_MISSING_STATS_ONLY)); if (vacattrstats[tcnt] != NULL) tcnt++; } @@ -413,7 +464,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, tcnt = 0; for (i = 1; i <= attr_cnt; i++) { - vacattrstats[tcnt] = examine_attribute(onerel, i, NULL); + vacattrstats[tcnt] = examine_attribute(onerel, i, NULL,(params.options & VACOPT_MISSING_STATS_ONLY)); if (vacattrstats[tcnt] != NULL) tcnt++; } @@ -482,7 +533,7 @@ do_analyze_rel(Relation onerel, const VacuumParams params, indexpr_item = lnext(indexInfo->ii_Expressions, indexpr_item); thisdata->vacattrstats[tcnt] = - examine_attribute(Irel[ind], i + 1, indexkey); + examine_attribute(Irel[ind], i + 1, indexkey,(params.options & VACOPT_MISSING_STATS_ONLY)); if (thisdata->vacattrstats[tcnt] != NULL) tcnt++; } @@ -492,6 +543,40 @@ do_analyze_rel(Relation onerel, const VacuumParams params, } } + /* + * If ANALYZE (MISSING_STATS_ONLY) and nothing to analyze, + * skip relation. + */ + if ((params.options & VACOPT_MISSING_STATS_ONLY) && + attr_cnt == 0) + { + bool has_missing_ext = relation_has_missing_extended_stats(onerel); + + bool has_expr = false; + + for (ind = 0; ind < nindexes; ind++) + { + if (indexdata[ind].attr_cnt > 0) + { + has_expr = true; + break; + } + } + + if (!has_expr && !has_missing_ext) + { + if (verbose) + ereport(INFO, + (errmsg("skipping analyzing \"%s.%s.%s\"", + get_database_name(MyDatabaseId), + get_namespace_name(RelationGetNamespace(onerel)), + RelationGetRelationName(onerel)))); + + vac_close_indexes(nindexes, Irel, NoLock); + return; + } + } + /* * Determine how many rows we need to sample, using the worst case from * all analyzable columns. We use a lower bound of 100 rows to avoid @@ -1039,7 +1124,7 @@ compute_index_stats(Relation onerel, double totalrows, * and index_expr is the expression tree representing the column's data. */ static VacAttrStats * -examine_attribute(Relation onerel, int attnum, Node *index_expr) +examine_attribute(Relation onerel, int attnum, Node *index_expr, bool missing_stats_only) { Form_pg_attribute attr = TupleDescAttr(onerel->rd_att, attnum - 1); int attstattarget; @@ -1076,6 +1161,45 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr) if (attstattarget == 0) return NULL; + /* + * If ANALYZE (MISSING_STATS_ONLY), + * skip attributes that already have stats. + */ + if (missing_stats_only) + { + HeapTuple statstup; + bool has_stats = false; + + /* Try stainherit = false */ + statstup = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(RelationGetRelid(onerel)), + Int16GetDatum(attnum), + BoolGetDatum(false)); + + if (HeapTupleIsValid(statstup)) + { + has_stats = true; + ReleaseSysCache(statstup); + } + else + { + /* Try stainherit = true (partition parents) */ + statstup = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(RelationGetRelid(onerel)), + Int16GetDatum(attnum), + BoolGetDatum(true)); + + if (HeapTupleIsValid(statstup)) + { + has_stats = true; + ReleaseSysCache(statstup); + } + } + if (has_stats) + return NULL; /* skip attribute */ + + } + /* * Create the VacAttrStats struct. */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 03932f45..d031f89b 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -165,6 +165,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) BufferAccessStrategy bstrategy = NULL; bool verbose = false; bool skip_locked = false; + bool missing_stats_only = false; bool analyze = false; bool freeze = false; bool full = false; @@ -229,6 +230,8 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) ring_size = result; } + else if (strcmp(opt->defname, "missing_stats_only") == 0) + missing_stats_only = defGetBoolean(opt); else if (!vacstmt->is_vacuumcmd) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), @@ -305,6 +308,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) (vacstmt->is_vacuumcmd ? VACOPT_VACUUM : VACOPT_ANALYZE) | (verbose ? VACOPT_VERBOSE : 0) | (skip_locked ? VACOPT_SKIP_LOCKED : 0) | + (missing_stats_only ? VACOPT_MISSING_STATS_ONLY :0) | (analyze ? VACOPT_ANALYZE : 0) | (freeze ? VACOPT_FREEZE : 0) | (full ? VACOPT_FULL : 0) | @@ -315,7 +319,7 @@ ExecVacuum(ParseState *pstate, VacuumStmt *vacstmt, bool isTopLevel) (only_database_stats ? VACOPT_ONLY_DATABASE_STATS : 0); /* sanity checks on options */ - Assert(params.options & (VACOPT_VACUUM | VACOPT_ANALYZE)); + Assert(params.options & (VACOPT_VACUUM | VACOPT_ANALYZE | VACOPT_MISSING_STATS_ONLY)); Assert((params.options & VACOPT_VACUUM) || !(params.options & (VACOPT_FULL | VACOPT_FREEZE))); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index e885a4b9..4ca45fe6 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -188,6 +188,7 @@ typedef struct VacAttrStats #define VACOPT_DISABLE_PAGE_SKIPPING 0x100 /* don't skip any pages */ #define VACOPT_SKIP_DATABASE_STATS 0x200 /* skip vac_update_datfrozenxid() */ #define VACOPT_ONLY_DATABASE_STATS 0x400 /* only vac_update_datfrozenxid() */ +#define VACOPT_MISSING_STATS_ONLY 0x800 /* ANALYZE if stats are missing */ /* * Values used by index_cleanup and truncate params. diff --git a/src/test/regress/expected/analyze_missing_stats_only.out b/src/test/regress/expected/analyze_missing_stats_only.out new file mode 100644 index 00000000..cde00dc1 --- /dev/null +++ b/src/test/regress/expected/analyze_missing_stats_only.out @@ -0,0 +1,97 @@ +-- +-- ANALYZE (MISSING_STATS_ONLY) regression test +-- +-- 1. Brand new empty table +CREATE TABLE ms1 (a int); +-- No statistics yet → should analyze +ANALYZE (MISSING_STATS_ONLY) ms1; +-- pg_statistic should still be empty (table has no rows) +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + stat_rows +----------- + 0 +(1 row) + +-- 2. Insert data +INSERT INTO ms1 SELECT generate_series(1,10); +-- Stats missing → should analyze +ANALYZE (MISSING_STATS_ONLY) ms1; +-- Now stats must exist +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + stat_rows +----------- + 1 +(1 row) + +-- 3. Re-run → stats exist → should skip +ANALYZE (MISSING_STATS_ONLY) ms1; +-- Stats count should remain the same +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + stat_rows +----------- + 1 +(1 row) + +-- 4. Add new column → missing stats for column b +ALTER TABLE ms1 ADD COLUMN b int; +-- Should analyze again +ANALYZE (MISSING_STATS_ONLY) ms1; +-- Now both columns should have stats +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + stat_rows +----------- + 2 +(1 row) + +DROP TABLE ms1; +-- 5. ANALYZE with explicit column list +CREATE TABLE ms_cols (a int, b int); +INSERT INTO ms_cols SELECT i, i FROM generate_series(1,10) i; +ANALYZE (MISSING_STATS_ONLY) ms_cols; +-- Re-run for column a only → should skip +ANALYZE (MISSING_STATS_ONLY) ms_cols (a); +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms_cols'::regclass; + stat_rows +----------- + 2 +(1 row) + +DROP TABLE ms_cols; +-- 6. Expression index +CREATE TABLE ms_expr (a int); +INSERT INTO ms_expr SELECT generate_series(1,10); +CREATE INDEX ms_expr_idx ON ms_expr ((a + 1)); +ANALYZE (MISSING_STATS_ONLY) ms_expr; +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_expr; +DROP TABLE ms_expr; +-- 7. Declarative partitioning +CREATE TABLE ms_part (a int) PARTITION BY RANGE (a); +CREATE TABLE ms_part_1 PARTITION OF ms_part + FOR VALUES FROM (0) TO (100); +INSERT INTO ms_part SELECT generate_series(1,10); +-- Should analyze parent and partition correctly +ANALYZE (MISSING_STATS_ONLY) ms_part; +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_part; +DROP TABLE ms_part; +-- 8. Table inheritance +CREATE TABLE ms_parent (a int); +CREATE TABLE ms_child () INHERITS (ms_parent); +INSERT INTO ms_child SELECT generate_series(1,10); +-- Analyze parent with inheritance +ANALYZE (MISSING_STATS_ONLY) ms_parent; +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_parent; +DROP TABLE ms_child; +DROP TABLE ms_parent; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 549e9b2d..a64be382 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -90,6 +90,7 @@ test: rules psql psql_crosstab psql_pipeline amutils stats_ext collate.linux.utf test: select_parallel test: write_parallel test: vacuum_parallel +test: analyze_missing_stats_only # Run this alone, because concurrent DROP TABLE would make non-superuser # "ANALYZE;" fail with "relation with OID $n does not exist". diff --git a/src/test/regress/sql/analyze_missing_stats_only.sql b/src/test/regress/sql/analyze_missing_stats_only.sql new file mode 100644 index 00000000..d34f1687 --- /dev/null +++ b/src/test/regress/sql/analyze_missing_stats_only.sql @@ -0,0 +1,109 @@ +-- +-- ANALYZE (MISSING_STATS_ONLY) regression test +-- + +-- 1. Brand new empty table +CREATE TABLE ms1 (a int); + +-- No statistics yet → should analyze +ANALYZE (MISSING_STATS_ONLY) ms1; + +-- pg_statistic should still be empty (table has no rows) +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + +-- 2. Insert data +INSERT INTO ms1 SELECT generate_series(1,10); + +-- Stats missing → should analyze +ANALYZE (MISSING_STATS_ONLY) ms1; + +-- Now stats must exist +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + +-- 3. Re-run → stats exist → should skip +ANALYZE (MISSING_STATS_ONLY) ms1; + +-- Stats count should remain the same +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + +-- 4. Add new column → missing stats for column b +ALTER TABLE ms1 ADD COLUMN b int; + +-- Should analyze again +ANALYZE (MISSING_STATS_ONLY) ms1; + +-- Now both columns should have stats +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms1'::regclass; + +DROP TABLE ms1; + +-- 5. ANALYZE with explicit column list + +CREATE TABLE ms_cols (a int, b int); +INSERT INTO ms_cols SELECT i, i FROM generate_series(1,10) i; + +ANALYZE (MISSING_STATS_ONLY) ms_cols; + +-- Re-run for column a only → should skip +ANALYZE (MISSING_STATS_ONLY) ms_cols (a); + +SELECT count(*) AS stat_rows +FROM pg_statistic +WHERE starelid = 'ms_cols'::regclass; + +DROP TABLE ms_cols; + +-- 6. Expression index + +CREATE TABLE ms_expr (a int); +INSERT INTO ms_expr SELECT generate_series(1,10); + +CREATE INDEX ms_expr_idx ON ms_expr ((a + 1)); + +ANALYZE (MISSING_STATS_ONLY) ms_expr; + +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_expr; + +DROP TABLE ms_expr; + +-- 7. Declarative partitioning + +CREATE TABLE ms_part (a int) PARTITION BY RANGE (a); + +CREATE TABLE ms_part_1 PARTITION OF ms_part + FOR VALUES FROM (0) TO (100); + +INSERT INTO ms_part SELECT generate_series(1,10); + +-- Should analyze parent and partition correctly +ANALYZE (MISSING_STATS_ONLY) ms_part; + +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_part; + +DROP TABLE ms_part; + +-- 8. Table inheritance + +CREATE TABLE ms_parent (a int); +CREATE TABLE ms_child () INHERITS (ms_parent); + +INSERT INTO ms_child SELECT generate_series(1,10); + +-- Analyze parent with inheritance +ANALYZE (MISSING_STATS_ONLY) ms_parent; + +-- Re-run → should skip +ANALYZE (MISSING_STATS_ONLY) ms_parent; + +DROP TABLE ms_child; +DROP TABLE ms_parent; -- 2.43.0