From a6b1cb866f9374cdc893e9a318959eccaa5bfbc9 Mon Sep 17 00:00:00 2001
From: Oleksandr Shulgin <oleksandr.shulgin@zalando.de>
Date: Wed, 2 Mar 2016 18:18:36 +0100
Subject: [PATCH 1/2] Account for NULLs in ANALYZE more strictly

Previously the ndistinct and avgcount calculations (for the MCV list)
could be greatly affected by a high fraction of NULLs in the sample.
Account for that by explicitly subtracting the number of NULLs we've
seen from the total sample size.

At the same time, values that are considered "too wide" are accounted
for in ndistinct, but removed from the sample size for the MCV list
calculation.  In compute_distinct_stats() we need to do that manually;
in compute_scalar_stats() values_cnt already holds the number of
non-null, not too-wide values.
---
 src/backend/commands/analyze.c | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 8a5f07c..f05b496 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -2085,17 +2085,21 @@ compute_distinct_stats(VacAttrStatsP stats,
 						denom,
 						stadistinct;
 
-			numer = (double) samplerows *(double) d;
+			double		samplerows_nonnull = samplerows - null_cnt;
+			double		totalrows_nonnull
+							= totalrows * (1.0 - stats->stanullfrac);
 
-			denom = (double) (samplerows - f1) +
-				(double) f1 *(double) samplerows / totalrows;
+			numer = samplerows_nonnull * (double) d;
+
+			denom = (samplerows_nonnull - f1) +
+				(double) f1 * samplerows_nonnull / totalrows_nonnull;
 
 			stadistinct = numer / denom;
 			/* Clamp to sane range in case of roundoff error */
 			if (stadistinct < (double) d)
 				stadistinct = (double) d;
-			if (stadistinct > totalrows)
-				stadistinct = totalrows;
+			if (stadistinct > totalrows_nonnull)
+				stadistinct = totalrows_nonnull;
 			stats->stadistinct = floor(stadistinct + 0.5);
 		}
 
@@ -2124,16 +2128,17 @@ compute_distinct_stats(VacAttrStatsP stats,
 			/* Track list includes all values seen, and all will fit */
 			num_mcv = track_cnt;
 		}
-		else
+		else if (track_cnt != 0)
 		{
+			int			sample_cnt = nonnull_cnt - toowide_cnt;
 			double		ndistinct = stats->stadistinct;
 			double		avgcount,
 						mincount;
 
 			if (ndistinct < 0)
-				ndistinct = -ndistinct * totalrows;
+				ndistinct = -ndistinct * sample_cnt;
 			/* estimate # of occurrences in sample of a typical value */
-			avgcount = (double) samplerows / ndistinct;
+			avgcount = (double) sample_cnt / ndistinct;
 			/* set minimum threshold count to store a value */
 			mincount = avgcount * 1.25;
 			if (mincount < 2)
@@ -2434,17 +2439,21 @@ compute_scalar_stats(VacAttrStatsP stats,
 						denom,
 						stadistinct;
 
-			numer = (double) samplerows *(double) d;
+			double		samplerows_nonnull = samplerows - null_cnt;
+			double		totalrows_nonnull
+							= totalrows * (1.0 - stats->stanullfrac);
+
+			numer = samplerows_nonnull * (double) d;
 
-			denom = (double) (samplerows - f1) +
-				(double) f1 *(double) samplerows / totalrows;
+			denom = (samplerows_nonnull - f1) +
+				(double) f1 * samplerows_nonnull / totalrows_nonnull;
 
 			stadistinct = numer / denom;
 			/* Clamp to sane range in case of roundoff error */
 			if (stadistinct < (double) d)
 				stadistinct = (double) d;
-			if (stadistinct > totalrows)
-				stadistinct = totalrows;
+			if (stadistinct > totalrows_nonnull)
+				stadistinct = totalrows_nonnull;
 			stats->stadistinct = floor(stadistinct + 0.5);
 		}
 
@@ -2480,21 +2489,18 @@ compute_scalar_stats(VacAttrStatsP stats,
 		}
 		else
 		{
-			double		ndistinct = stats->stadistinct;
 			double		avgcount,
 						mincount,
 						maxmincount;
 
-			if (ndistinct < 0)
-				ndistinct = -ndistinct * totalrows;
 			/* estimate # of occurrences in sample of a typical value */
-			avgcount = (double) samplerows / ndistinct;
+			avgcount = (double) values_cnt / (double) ndistinct;
 			/* set minimum threshold count to store a value */
 			mincount = avgcount * 1.25;
 			if (mincount < 2)
 				mincount = 2;
 			/* don't let threshold exceed 1/K, however */
-			maxmincount = (double) samplerows / (double) num_bins;
+			maxmincount = (double) values_cnt / (double) num_bins;
 			if (mincount > maxmincount)
 				mincount = maxmincount;
 			if (num_mcv > track_cnt)
-- 
2.5.0