From 7fa7c7acf2c6710f2198363b20a869d7b4dd450b Mon Sep 17 00:00:00 2001
From: Chengpeng Yan <chengpeng_yan@outlook.com>
Date: Sat, 7 Feb 2026 19:55:22 +0800
Subject: [PATCH v4 1/2] ANALYZE: speed up MCV tracking for equality-only types

compute_distinct_stats() tracks candidate most-common values for
datatypes that have an equality operator but no ordering.  Matching each
sampled value against the tracking array requires a linear search, which
becomes expensive when statistics targets are set high.

For statistics targets of 100 and above, and when the type's default
hash support matches the equality operator, maintain a simplehash table
that maps a tracked value to its current track[] slot.  This reduces
match lookups from O(n) to O(1) on average, while keeping the linear
path as a fallback when suitable hash support isn't available.

Use a round-robin cursor over the count=1 region to evict singletons
without repeatedly shifting the tracking array and updating many
hash->index mappings.
---
 src/backend/commands/analyze.c | 251 +++++++++++++++++++++++++++++----
 1 file changed, 227 insertions(+), 24 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index a483424152c..f7a5d2d46dd 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -54,6 +54,7 @@
 #include "utils/sortsupport.h"
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
+#include "utils/typcache.h"
 
 
 /* Per-index data for ANALYZE */
@@ -1888,6 +1889,75 @@ static int	analyze_mcv_list(int *mcv_counts,
 							 int samplerows,
 							 double totalrows);
 
+/*
+ * Build a hash table for distinct/MCV tracking only when the statistics
+ * target is large enough to justify the overhead of maintaining it.
+ */
+#define ANALYZE_HASH_THRESHOLD  100
+
+typedef struct DistinctHashEntry
+{
+	Datum		value;			/* hash key: the tracked datum */
+	int			index;			/* current position of value in track[] */
+	uint32		hash;			/* cached hash of value (SH_STORE_HASH) */
+	char		status;			/* entry state, managed by simplehash.h */
+}			DistinctHashEntry;
+
+typedef struct DistinctHashContext
+{
+	FmgrInfo   *cmpfunc;		/* equality function used by SH_EQUAL */
+	FmgrInfo   *hashfunc;		/* hash function used by SH_HASH_KEY */
+	Oid			collation;		/* collation passed to both functions */
+}			DistinctHashContext;
+
+typedef struct DistinctHash_hash DistinctHash_hash;
+
+static uint32 distinct_hash_hash(DistinctHash_hash * tab, Datum key);
+static bool distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1);
+/* Instantiate simplehash.h as DistinctHash_*: Datum keys, stored hashes. */
+#define SH_PREFIX DistinctHash
+#define SH_ELEMENT_TYPE DistinctHashEntry
+#define SH_KEY_TYPE Datum
+#define SH_KEY value
+#define SH_HASH_KEY(tab, key) distinct_hash_hash(tab, key)
+#define SH_EQUAL(tab, key0, key1) distinct_hash_equal(tab, key0, key1)
+#define SH_SCOPE static inline
+#define SH_STORE_HASH
+#define SH_GET_HASH(tab, ent) ((ent)->hash)
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+static uint32
+distinct_hash_hash(DistinctHash_hash * tab, Datum key)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+	/* Hash the key via the context's hash function and collation. */
+	result = FunctionCall1Coll(context->hashfunc, context->collation, key);
+	return DatumGetUInt32(result);
+}
+
+static bool
+distinct_hash_equal(DistinctHash_hash * tab, Datum key0, Datum key1)
+{
+	DistinctHashContext *context = (DistinctHashContext *) tab->private_data;
+	Datum		result;
+	/* Test equality via the context's comparison function/collation. */
+	result = FunctionCall2Coll(context->cmpfunc, context->collation, key0, key1);
+	return DatumGetBool(result);
+}
+
+static inline void
+distinct_hash_set_index(DistinctHash_hash * hash, Datum value,
+						uint32 value_hash, int index)
+{
+	DistinctHashEntry *entry = DistinctHash_lookup_hash(hash, value, value_hash);
+	/* value must already be present; repoint its entry at index. */
+	Assert(entry != NULL);
+	entry->index = index;
+}
+
 
 /*
  * std_typanalyze -- the default type-specific typanalyze function
@@ -2055,10 +2125,14 @@ compute_trivial_stats(VacAttrStatsP stats,
  *
  *	The most common values are determined by brute force: we keep a list
  *	of previously seen values, ordered by number of times seen, as we scan
- *	the samples.  A newly seen value is inserted just after the last
- *	multiply-seen value, causing the bottommost (oldest) singly-seen value
- *	to drop off the list.  The accuracy of this method, and also its cost,
- *	depend mainly on the length of the list we are willing to keep.
+ *	the samples.  Newly seen values are appended to the list, and when it's
+ *	full we replace the oldest singly-seen value (FIFO) using a round-robin
+ *	cursor (clock hand) over the count=1 region.  This avoids repeatedly
+ *	shifting the count=1 region and, when hashing is enabled, avoids having
+ *	to update a large number of hash->index mappings.
+ *
+ *	The accuracy of this method, and also its cost, depend mainly on the
+ *	length of the list we are willing to keep.
  */
 static void
 compute_distinct_stats(VacAttrStatsP stats,
@@ -2076,15 +2150,22 @@ compute_distinct_stats(VacAttrStatsP stats,
 	bool		is_varwidth = (!stats->attrtype->typbyval &&
 							   stats->attrtype->typlen < 0);
 	FmgrInfo	f_cmpeq;
+	TypeCacheEntry *typentry;
 	typedef struct
 	{
 		Datum		value;
 		int			count;
+		uint32		hash;
 	} TrackItem;
 	TrackItem  *track;
 	int			track_cnt,
 				track_max;
 	int			num_mcv = stats->attstattarget;
+	int			firstcount1 = 0;	/* index of first singleton in track[] */
+	int			c1_cursor = 0;	/* next singleton to evict (FIFO) */
+	bool		use_hash;
+	DistinctHashContext hash_context;
+	DistinctHash_hash *track_hash = NULL;
 	StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data;
 
 	/*
@@ -2097,14 +2178,33 @@ compute_distinct_stats(VacAttrStatsP stats,
 	track_cnt = 0;
 
 	fmgr_info(mystats->eqfunc, &f_cmpeq);
+	typentry = lookup_type_cache(stats->attrtypid,
+								 TYPECACHE_HASH_PROC_FINFO | TYPECACHE_EQ_OPR);
+
+	/*
+	 * For sufficiently large statistics targets, use a hash table to avoid
+	 * repeated linear searches of the track[] array, but only when we can use
+	 * the type's default hash support that matches the equality operator.
+	 */
+	use_hash = (num_mcv >= ANALYZE_HASH_THRESHOLD &&
+				mystats->eqopr == typentry->eq_opr &&
+				OidIsValid(typentry->hash_proc));
+	if (use_hash)
+	{
+		hash_context.cmpfunc = &f_cmpeq;
+		hash_context.hashfunc = &typentry->hash_proc_finfo;
+		hash_context.collation = stats->attrcollid;
+		track_hash = DistinctHash_create(CurrentMemoryContext, track_max,
+										 &hash_context);
+	}
 
 	for (i = 0; i < samplerows; i++)
 	{
 		Datum		value;
 		bool		isnull;
 		bool		match;
-		int			firstcount1,
-					j;
+		int			j = 0;
+		uint32		value_hash = 0;
 
 		vacuum_delay_point(true);
 
@@ -2151,47 +2251,150 @@ compute_distinct_stats(VacAttrStatsP stats,
 		/*
 		 * See if the value matches anything we're already tracking.
 		 */
-		match = false;
-		firstcount1 = track_cnt;
-		for (j = 0; j < track_cnt; j++)
+		if (use_hash)
 		{
-			if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
-											   stats->attrcollid,
-											   value, track[j].value)))
+			DistinctHashEntry *entry;
+
+			value_hash = distinct_hash_hash(track_hash, value);
+			entry = DistinctHash_lookup_hash(track_hash, value, value_hash);
+			match = (entry != NULL);
+			if (match)
+				j = entry->index;
+		}
+		else
+		{
+			match = false;
+			firstcount1 = track_cnt;
+			for (j = 0; j < track_cnt; j++)
 			{
-				match = true;
-				break;
+				if (DatumGetBool(FunctionCall2Coll(&f_cmpeq,
+												   stats->attrcollid,
+												   value, track[j].value)))
+				{
+					match = true;
+					break;
+				}
+				if (j < firstcount1 && track[j].count == 1)
+					firstcount1 = j;
 			}
-			if (j < firstcount1 && track[j].count == 1)
-				firstcount1 = j;
 		}
 
 		if (match)
 		{
+			bool		was_count1;
+			int			match_index = j;
+
 			/* Found a match */
+			was_count1 = (track[j].count == 1);
 			track[j].count++;
 			/* This value may now need to "bubble up" in the track list */
 			while (j > 0 && track[j].count > track[j - 1].count)
 			{
 				swapDatum(track[j].value, track[j - 1].value);
 				swapInt(track[j].count, track[j - 1].count);
+				if (use_hash)
+				{
+					uint32		tmp;
+
+					tmp = track[j].hash;
+					track[j].hash = track[j - 1].hash;
+					track[j - 1].hash = tmp;
+					distinct_hash_set_index(track_hash, track[j].value,
+											track[j].hash, j);
+					distinct_hash_set_index(track_hash, track[j - 1].value,
+											track[j - 1].hash, j - 1);
+				}
 				j--;
 			}
+
+			/*
+			 * When a singleton becomes multiply-seen, bubble-up swaps move it
+			 * into the count>1 prefix and shift the preceding singletons
+			 * track[firstcount1..match_index-1] right by one.
+			 *
+			 * If c1_cursor points to a shifted singleton, or to the promoted
+			 * singleton itself (match_index), advance it so FIFO eviction
+			 * matches the original shift-based behavior.
+			 */
+			if (was_count1 &&
+				c1_cursor >= firstcount1 &&
+				c1_cursor <= match_index)
+			{
+				c1_cursor++;
+				if (c1_cursor >= track_cnt)
+					c1_cursor = firstcount1 + 1;
+			}
+
+			/*
+			 * In hash mode, a promoted singleton advances the first singleton
+			 * boundary by one slot.
+			 */
+			if (use_hash && was_count1 && j <= firstcount1)
+				firstcount1++;
+
+			/*
+			 * In hash mode, bubble-up may promote a singleton out of the
+			 * count=1 region and advance firstcount1, so c1_cursor might now
+			 * point into the count>1 prefix.
+			 */
+			if (use_hash && c1_cursor < firstcount1)
+				c1_cursor = firstcount1;
 		}
 		else
 		{
-			/* No match.  Insert at head of count-1 list */
+			int			insert_index;
+
+			/*
+			 * No match.  Track a single-occurrence value if we have a slot.
+			 * If we're full, evict the oldest singleton (FIFO) from the
+			 * count=1 region (track[firstcount1..track_cnt-1]) while leaving
+			 * the multiply-seen items intact, rather than shifting the array.
+			 */
 			if (track_cnt < track_max)
-				track_cnt++;
-			for (j = track_cnt - 1; j > firstcount1; j--)
+				insert_index = track_cnt++;
+			else if (firstcount1 < track_cnt)
 			{
-				track[j].value = track[j - 1].value;
-				track[j].count = track[j - 1].count;
+				/*
+				 * Match the old shift-based FIFO eviction without O(n)
+				 * shifting by treating the count=1 region as a ring and
+				 * advancing a round-robin cursor (clock hand) over it.
+				 *
+				 * Bubble-up promotions can advance firstcount1, so first
+				 * bring c1_cursor back inside the count=1 region
+				 * (track[firstcount1..track_cnt-1]) if it has fallen out.
+				 */
+				if (c1_cursor < firstcount1 || c1_cursor >= track_cnt)
+					c1_cursor = firstcount1;
+				insert_index = c1_cursor++;
+				if (c1_cursor >= track_cnt)
+					c1_cursor = firstcount1;
+
+				if (use_hash)
+				{
+					DistinctHashEntry *delentry;
+
+					delentry = DistinctHash_lookup_hash(track_hash,
+														track[insert_index].value,
+														track[insert_index].hash);
+					Assert(delentry != NULL);
+					DistinctHash_delete_item(track_hash, delentry);
+				}
 			}
-			if (firstcount1 < track_cnt)
+			else
+				continue;
+
+			track[insert_index].value = value;
+			track[insert_index].count = 1;
+			if (use_hash)
 			{
-				track[firstcount1].value = value;
-				track[firstcount1].count = 1;
+				bool		found_hash;
+				DistinctHashEntry *entry;
+
+				track[insert_index].hash = value_hash;
+				entry = DistinctHash_insert_hash(track_hash, value, value_hash,
+												 &found_hash);
+				Assert(!found_hash);
+				entry->index = insert_index;
 			}
 		}
 	}
-- 
2.50.1 (Apple Git-155)