From 6f081a9525aff0792fd42b2d629133ed5114a6de Mon Sep 17 00:00:00 2001 From: Evdokimov Ilia Date: Sun, 3 May 2026 17:04:34 +0300 Subject: [PATCH v6 2/2] ANALYZE: hash-accelerate MCV tracking for equality-only types compute_distinct_stats() still performs a linear search of track[] for each sampled value. At higher statistics targets, that lookup cost can dominate ANALYZE time for equality-only datatypes. When the type cache's default hash support matches the equality operator, and the statistics target is at least ANALYZE_HASH_THRESHOLD, maintain a simplehash table that maps tracked values to their current track[] slots. That reduces match lookup from linear to O(1) on average. Entries that move or are replaced update the hash table so track[] and the lookup structure stay in sync, while the existing linear path remains available as a fallback. --- src/backend/commands/analyze.c | 171 ++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 11 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 262adf41b24..2e6fbe5cc12 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -55,6 +55,7 @@ #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" +#include "utils/typcache.h" /* Per-index data for ANALYZE */ typedef struct AnlIndexData @@ -1900,6 +1901,12 @@ ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) */ #define WIDTH_THRESHOLD 1024 +/* + * Build a hash table for distinct/MCV tracking only when the statistics + * target is large enough to justify the overhead of maintaining it. 
+ */ +#define ANALYZE_HASH_THRESHOLD 100 + #define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0) #define swapDatum(a,b) do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0) @@ -1918,7 +1925,29 @@ typedef struct int *tupnoLink; } CompareScalarsContext; +/* Entries in the simplehash hash table used by compute_distinct_stats */ +typedef struct DistinctHashEntry +{ + Datum value; /* the value represented by this entry */ + int index; /* its current index in track[] */ + uint32 hash; /* hash code for the Datum */ + char status; /* status code used by simplehash.h */ +} DistinctHashEntry; + +/* private_data for the simplehash hash table */ +typedef struct DistinctHashContext +{ + FmgrInfo *eqfunc; /* the equality operator */ + FmgrInfo *hashfunc; /* the hash function to use */ + Oid collation; /* collation to use for equality and hash calls */ +} DistinctHashContext; + +/* forward reference */ +typedef struct DistinctHash_hash DistinctHash_hash; + +static uint32 distinct_hash_hash(DistinctHash_hash * tab, Datum key); +static bool distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1); static void compute_trivial_stats(VacAttrStatsP stats, AnalyzeAttrFetchFunc fetchfunc, int samplerows, @@ -1940,6 +1969,53 @@ static int analyze_mcv_list(int *mcv_counts, int samplerows, double totalrows); +/* Define the simplehash hash table used by compute_distinct_stats */ +#define SH_PREFIX DistinctHash +#define SH_ELEMENT_TYPE DistinctHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY value +#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key) +#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tab, ent) ((ent)->hash) +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + + +/* + * Support functions for the hash tables used by compute_distinct_stats + */ +static uint32 +distinct_hash_hash(DistinctHash_hash * tab, Datum key) +{ + DistinctHashContext 
*context = (DistinctHashContext *) tab->private_data; + Datum result; + + result = FunctionCall1Coll(context->hashfunc, context->collation, key); + return DatumGetUInt32(result); +} + +static bool +distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1) +{ + DistinctHashContext *context = (DistinctHashContext *) tab->private_data; + Datum result; + + result = FunctionCall2Coll(context->eqfunc, context->collation, key0, key1); + return DatumGetBool(result); +} + +static inline void +distinct_hash_set_index(DistinctHash_hash * hash, Datum value, + uint32 value_hash, int index) +{ + DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash); + + Assert(entry != NULL); + entry->index = index; +} /* * std_typanalyze -- the default type-specific typanalyze function @@ -2129,15 +2205,20 @@ compute_distinct_stats(VacAttrStatsP stats, bool is_varwidth = (!stats->attrtype->typbyval && stats->attrtype->typlen < 0); FmgrInfo f_cmpeq; + TypeCacheEntry *typentry; typedef struct { Datum value; int count; + uint32 hash; } TrackItem; TrackItem *track; int track_cnt, track_max; int num_mcv = stats->attstattarget; + bool use_hash; + DistinctHashContext hash_context; + DistinctHash_hash *track_hash = NULL; int firstcount1 = 0, /* index of first singleton in track[] */ c1_cursor = 0; /* next singleton to evict (FIFO) */ StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; @@ -2152,6 +2233,26 @@ compute_distinct_stats(VacAttrStatsP stats, track_cnt = 0; fmgr_info(mystats->eqfunc, &f_cmpeq); + typentry = lookup_type_cache(stats->attrtypid, + TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR); + + /* + * For sufficiently large statistics targets, use a hash table to avoid + * repeated linear searches of the track[] array, but only when we can use + * the type's default hash support that matches the equality operator. 
+ */ + use_hash = (num_mcv >= ANALYZE_HASH_THRESHOLD && + mystats->eqopr == typentry->eq_opr && + OidIsValid(typentry->hash_proc)); + + if (use_hash) + { + hash_context.eqfunc = &f_cmpeq; + hash_context.hashfunc = &typentry->hash_proc_finfo; + hash_context.collation = stats->attrcollid; + track_hash = DistinctHash_create(CurrentMemoryContext, track_max, + &hash_context); + } for (i = 0; i < samplerows; i++) { @@ -2160,6 +2261,7 @@ compute_distinct_stats(VacAttrStatsP stats, bool match; int match_index, j; + uint32 value_hash = 0; vacuum_delay_point(true); @@ -2206,20 +2308,33 @@ compute_distinct_stats(VacAttrStatsP stats, /* * See if the value matches anything we're already tracking. */ - match = false; - firstcount1 = track_cnt; - for (j = 0; j < track_cnt; j++) + if (use_hash) { - if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, - stats->attrcollid, - value, track[j].value))) + DistinctHashEntry *entry; + + value_hash = distinct_hash_hash(track_hash, value); + entry = DistinctHash_lookup_hash(track_hash, value, value_hash); + match = (entry != NULL); + if (match) + match_index = entry->index; + } + else + { + match = false; + firstcount1 = track_cnt; + for (j = 0; j < track_cnt; j++) { - match = true; - match_index = j; - break; + if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, + stats->attrcollid, + value, track[j].value))) + { + match = true; + match_index = j; + break; + } + if (j < firstcount1 && track[j].count == 1) + firstcount1 = j; } - if (j < firstcount1 && track[j].count == 1) - firstcount1 = j; } if (match) @@ -2234,6 +2349,18 @@ compute_distinct_stats(VacAttrStatsP stats, { swapDatum(track[j].value, track[j - 1].value); swapInt(track[j].count, track[j - 1].count); + if (use_hash) + { + uint32 tmp; + + tmp = track[j].hash; + track[j].hash = track[j - 1].hash; + track[j - 1].hash = tmp; + distinct_hash_set_index(track_hash, track[j].value, + track[j].hash, j); + distinct_hash_set_index(track_hash, track[j - 1].value, + track[j - 1].hash, j - 1); + } } /* @@ 
-2281,12 +2408,34 @@ compute_distinct_stats(VacAttrStatsP stats, insert_index = c1_cursor++; if (c1_cursor >= track_cnt) c1_cursor = firstcount1; + + if (use_hash) + { + DistinctHashEntry *delentry; + + delentry = DistinctHash_lookup_hash(track_hash, + track[insert_index].value, + track[insert_index].hash); + Assert(delentry != NULL); + DistinctHash_delete_item(track_hash, delentry); + } } else continue; track[insert_index].value = value; track[insert_index].count = 1; + if (use_hash) + { + bool found_hash; + DistinctHashEntry *entry; + + track[insert_index].hash = value_hash; + entry = DistinctHash_insert_hash(track_hash, value, value_hash, + &found_hash); + Assert(!found_hash); + entry->index = insert_index; + } } } -- 2.34.1