From 960b0d842e8c15f3436a456bc2cc92f388431f80 Mon Sep 17 00:00:00 2001 From: Chengpeng Yan Date: Thu, 9 Apr 2026 09:49:54 +0800 Subject: [PATCH v5 2/2] ANALYZE: hash-accelerate MCV tracking for equality-only types compute_distinct_stats() still performs a linear search of track[] for each sampled value. At higher statistics targets, that lookup cost can dominate ANALYZE time for equality-only datatypes. When the type cache's default hash support matches the equality operator, and the statistics target is at least ANALYZE_HASH_THRESHOLD, maintain a simplehash table that maps tracked values to their current track[] slots. That reduces match lookup from linear to O(1) on average. Entries that move or are replaced update the hash table so track[] and the lookup structure stay in sync, while the existing linear path remains available as a fallback. --- src/backend/commands/analyze.c | 184 ++++++++++++++++++++++++++++++--- 1 file changed, 171 insertions(+), 13 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 5a45819a114..6b4112155c2 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -55,6 +55,7 @@ #include "utils/sortsupport.h" #include "utils/syscache.h" #include "utils/timestamp.h" +#include "utils/typcache.h" /* Per-index data for ANALYZE */ @@ -1941,6 +1942,75 @@ static int analyze_mcv_list(int *mcv_counts, int samplerows, double totalrows); +/* + * Build a hash table for distinct/MCV tracking only when the statistics + * target is large enough to justify the overhead of maintaining it. 
+ */
+#define ANALYZE_HASH_THRESHOLD 100	/* compared against num_mcv (the statistics target) when deciding use_hash */
+
+typedef struct DistinctHashEntry
+{
+	Datum value;	/* hash key (SH_KEY): a value tracked in track[] */
+	int index;	/* that value's current slot in track[] */
+	uint32 hash;	/* stored hash of value, returned by SH_GET_HASH */
+	char status;	/* entry state field used by lib/simplehash.h */
+} DistinctHashEntry;
+
+typedef struct DistinctHashContext
+{
+	FmgrInfo *eqfunc;	/* equality function called by distinct_hash_equal() */
+	FmgrInfo *hashfunc;	/* hash function called by distinct_hash_hash() */
+	Oid collation;	/* collation passed to both calls */
+} DistinctHashContext;
+
+typedef struct DistinctHash_hash DistinctHash_hash;	/* forward declaration: the callbacks are prototyped before the include */
+
+static uint32 distinct_hash_hash(DistinctHash_hash * tab, Datum key);	/* used as SH_HASH_KEY */
+static bool distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1);	/* used as SH_EQUAL */
+
+#define SH_PREFIX DistinctHash	/* generated names are DistinctHash_* */
+#define SH_ELEMENT_TYPE DistinctHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value	/* key member of DistinctHashEntry */
+#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE static inline	/* generated functions stay private to this file */
+#define SH_STORE_HASH	/* keep each key's hash in its entry */
+#define SH_GET_HASH(tab, ent) ((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static uint32
+distinct_hash_hash(DistinctHash_hash * tab, Datum key)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;	/* context pointer given to DistinctHash_create() */
+	Datum result;
+
+	result = FunctionCall1Coll(context->hashfunc, context->collation, key);	/* hash key under context->collation */
+	return DatumGetUInt32(result);
+}
+
+static bool
+distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;	/* context pointer given to DistinctHash_create() */
+	Datum result;
+
+	result = FunctionCall2Coll(context->eqfunc, context->collation, key0, key1);	/* compare the two keys under context->collation */
+	return DatumGetBool(result);
+}
+
+static inline void
+distinct_hash_set_index(DistinctHash_hash * hash, Datum value,
+						uint32 value_hash, int index)
+{
+	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);	/* look up value using the hash passed in by the caller */
+
+	Assert(entry != NULL);	/* value must already be in the table */
+	entry->index = index;	/* record the value's new track[] slot */
+}
+
 
 /*
 * std_typanalyze -- the default type-specific typanalyze function
@@ -2111,7 +2181,8 @@ compute_trivial_stats(VacAttrStatsP stats,
 * the samples.
Newly seen values are appended to the list, and when it's * full we replace the oldest singly-seen value (FIFO) using a round-robin * cursor (clock hand) over the count=1 region. This avoids repeatedly - * shifting the count=1 region. + * shifting the count=1 region and, when hashing is enabled, avoids having + * to update a large number of hash->index mappings. */ static void compute_distinct_stats(VacAttrStatsP stats, @@ -2129,16 +2200,22 @@ compute_distinct_stats(VacAttrStatsP stats, bool is_varwidth = (!stats->attrtype->typbyval && stats->attrtype->typlen < 0); FmgrInfo f_cmpeq; + TypeCacheEntry *typentry; typedef struct { Datum value; int count; + uint32 hash; } TrackItem; TrackItem *track; int track_cnt, track_max; int num_mcv = stats->attstattarget; + int firstcount1 = 0; /* index of first singleton in track[] */ int c1_cursor = 0; /* next singleton to evict (FIFO) */ + bool use_hash; + DistinctHashContext hash_context; + DistinctHash_hash *track_hash = NULL; StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; /* @@ -2151,14 +2228,33 @@ compute_distinct_stats(VacAttrStatsP stats, track_cnt = 0; fmgr_info(mystats->eqfunc, &f_cmpeq); + typentry = lookup_type_cache(stats->attrtypid, + TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR); + + /* + * For sufficiently large statistics targets, use a hash table to avoid + * repeated linear searches of the track[] array, but only when we can use + * the type's default hash support that matches the equality operator. 
+ */ + use_hash = (num_mcv >= ANALYZE_HASH_THRESHOLD && + mystats->eqopr == typentry->eq_opr && + OidIsValid(typentry->hash_proc)); + if (use_hash) + { + hash_context.eqfunc = &f_cmpeq; + hash_context.hashfunc = &typentry->hash_proc_finfo; + hash_context.collation = stats->attrcollid; + track_hash = DistinctHash_create(CurrentMemoryContext, track_max, + &hash_context); + } for (i = 0; i < samplerows; i++) { Datum value; bool isnull; bool match; - int firstcount1, - j; + int j = 0; + uint32 value_hash = 0; vacuum_delay_point(true); @@ -2205,19 +2301,32 @@ compute_distinct_stats(VacAttrStatsP stats, /* * See if the value matches anything we're already tracking. */ - match = false; - firstcount1 = track_cnt; - for (j = 0; j < track_cnt; j++) + if (use_hash) + { + DistinctHashEntry *entry; + + value_hash = distinct_hash_hash(track_hash, value); + entry = DistinctHash_lookup_hash(track_hash, value, value_hash); + match = (entry != NULL); + if (match) + j = entry->index; + } + else { - if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, - stats->attrcollid, - value, track[j].value))) + match = false; + firstcount1 = track_cnt; + for (j = 0; j < track_cnt; j++) { - match = true; - break; + if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, + stats->attrcollid, + value, track[j].value))) + { + match = true; + break; + } + if (j < firstcount1 && track[j].count == 1) + firstcount1 = j; } - if (j < firstcount1 && track[j].count == 1) - firstcount1 = j; } if (match) @@ -2233,6 +2342,18 @@ compute_distinct_stats(VacAttrStatsP stats, { swapDatum(track[j].value, track[j - 1].value); swapInt(track[j].count, track[j - 1].count); + if (use_hash) + { + uint32 tmp; + + tmp = track[j].hash; + track[j].hash = track[j - 1].hash; + track[j - 1].hash = tmp; + distinct_hash_set_index(track_hash, track[j].value, + track[j].hash, j); + distinct_hash_set_index(track_hash, track[j - 1].value, + track[j - 1].hash, j - 1); + } j--; } @@ -2250,6 +2371,21 @@ compute_distinct_stats(VacAttrStatsP stats, if 
(c1_cursor >= track_cnt) c1_cursor = firstcount1 + 1; } + + /* + * In hash mode, a promoted singleton advances the first singleton + * boundary by one slot. + */ + if (use_hash && was_count1 && j <= firstcount1) + firstcount1++; + + /* + * In hash mode, bubble-up may promote a singleton out of the + * count=1 region and advance firstcount1, so c1_cursor might now + * point into the count>1 prefix. + */ + if (use_hash && c1_cursor < firstcount1) + c1_cursor = firstcount1; } else { @@ -2272,12 +2408,34 @@ compute_distinct_stats(VacAttrStatsP stats, insert_index = c1_cursor++; if (c1_cursor >= track_cnt) c1_cursor = firstcount1; + + if (use_hash) + { + DistinctHashEntry *delentry; + + delentry = DistinctHash_lookup_hash(track_hash, + track[insert_index].value, + track[insert_index].hash); + Assert(delentry != NULL); + DistinctHash_delete_item(track_hash, delentry); + } } else continue; track[insert_index].value = value; track[insert_index].count = 1; + if (use_hash) + { + bool found_hash; + DistinctHashEntry *entry; + + track[insert_index].hash = value_hash; + entry = DistinctHash_insert_hash(track_hash, value, value_hash, + &found_hash); + Assert(!found_hash); + entry->index = insert_index; + } } } -- 2.50.1 (Apple Git-155)