Re: Bogus ANALYZE results for an otherwise-unique column with many nulls - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Bogus ANALYZE results for an otherwise-unique column with many nulls
Date
Msg-id 18487.1470351666@sss.pgh.pa.us
Whole thread Raw
In response to Bogus ANALYZE results for an otherwise-unique column with many nulls  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Bogus ANALYZE results for an otherwise-unique column with many nulls  (Andreas Joseph Krogh <andreas@visena.com>)
List pgsql-hackers
I wrote:
> Looking around, there are a couple of places outside commands/analyze.c
> that are making the same mistake, so this patch isn't complete, but it
> illustrates what needs to be done.

Here's a more complete patch.

            regards, tom lane

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 5fcedd7..9ac7122 100644
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** compute_distinct_stats(VacAttrStatsP sta
*** 2049,2056 ****

          if (nmultiple == 0)
          {
!             /* If we found no repeated values, assume it's a unique column */
!             stats->stadistinct = -1.0;
          }
          else if (track_cnt < track_max && toowide_cnt == 0 &&
                   nmultiple == track_cnt)
--- 2049,2059 ----

          if (nmultiple == 0)
          {
!             /*
!              * If we found no repeated non-null values, assume it's a unique
!              * column; but be sure to discount for any nulls we found.
!              */
!             stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
          }
          else if (track_cnt < track_max && toowide_cnt == 0 &&
                   nmultiple == track_cnt)
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2426,2433 ****

          if (nmultiple == 0)
          {
!             /* If we found no repeated values, assume it's a unique column */
!             stats->stadistinct = -1.0;
          }
          else if (toowide_cnt == 0 && nmultiple == ndistinct)
          {
--- 2429,2439 ----

          if (nmultiple == 0)
          {
!             /*
!              * If we found no repeated non-null values, assume it's a unique
!              * column; but be sure to discount for any nulls we found.
!              */
!             stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
          }
          else if (toowide_cnt == 0 && nmultiple == ndistinct)
          {
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2753,2759 ****
          else
              stats->stawidth = stats->attrtype->typlen;
          /* Assume all too-wide values are distinct, so it's a unique column */
!         stats->stadistinct = -1.0;
      }
      else if (null_cnt > 0)
      {
--- 2759,2765 ----
          else
              stats->stawidth = stats->attrtype->typlen;
          /* Assume all too-wide values are distinct, so it's a unique column */
!         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
      }
      else if (null_cnt > 0)
      {
diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c
index 0f851ea..817453c 100644
*** a/src/backend/tsearch/ts_typanalyze.c
--- b/src/backend/tsearch/ts_typanalyze.c
*************** compute_tsvector_stats(VacAttrStats *sta
*** 295,301 ****
          stats->stawidth = total_width / (double) nonnull_cnt;

          /* Assume it's a unique column (see notes above) */
!         stats->stadistinct = -1.0;

          /*
           * Construct an array of the interesting hashtable items, that is,
--- 295,301 ----
          stats->stawidth = total_width / (double) nonnull_cnt;

          /* Assume it's a unique column (see notes above) */
!         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

          /*
           * Construct an array of the interesting hashtable items, that is,
diff --git a/src/backend/utils/adt/rangetypes_typanalyze.c b/src/backend/utils/adt/rangetypes_typanalyze.c
index fcb71d3..56504fc 100644
*** a/src/backend/utils/adt/rangetypes_typanalyze.c
--- b/src/backend/utils/adt/rangetypes_typanalyze.c
*************** compute_range_stats(VacAttrStats *stats,
*** 203,209 ****
          /* Do the simple null-frac and width stats */
          stats->stanullfrac = (double) null_cnt / (double) samplerows;
          stats->stawidth = total_width / (double) non_null_cnt;
!         stats->stadistinct = -1.0;

          /* Must copy the target values into anl_context */
          old_cxt = MemoryContextSwitchTo(stats->anl_context);
--- 203,211 ----
          /* Do the simple null-frac and width stats */
          stats->stanullfrac = (double) null_cnt / (double) samplerows;
          stats->stawidth = total_width / (double) non_null_cnt;
!
!         /* Estimate that non-null values are unique */
!         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);

          /* Must copy the target values into anl_context */
          old_cxt = MemoryContextSwitchTo(stats->anl_context);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cc2a9a1..56943f2 100644
*** a/src/backend/utils/adt/selfuncs.c
--- b/src/backend/utils/adt/selfuncs.c
*************** double
*** 4738,4743 ****
--- 4738,4744 ----
  get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
  {
      double        stadistinct;
+     double        stanullfrac = 0.0;
      double        ntuples;

      *isdefault = false;
*************** get_variable_numdistinct(VariableStatDat
*** 4745,4751 ****
      /*
       * Determine the stadistinct value to use.  There are cases where we can
       * get an estimate even without a pg_statistic entry, or can get a better
!      * value than is in pg_statistic.
       */
      if (HeapTupleIsValid(vardata->statsTuple))
      {
--- 4746,4753 ----
      /*
       * Determine the stadistinct value to use.  There are cases where we can
       * get an estimate even without a pg_statistic entry, or can get a better
!      * value than is in pg_statistic.  Grab stanullfrac too if we can find it
!      * (otherwise, assume no nulls, for lack of any better idea).
       */
      if (HeapTupleIsValid(vardata->statsTuple))
      {
*************** get_variable_numdistinct(VariableStatDat
*** 4754,4759 ****
--- 4756,4762 ----

          stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
          stadistinct = stats->stadistinct;
+         stanullfrac = stats->stanullfrac;
      }
      else if (vardata->vartype == BOOLOID)
      {
*************** get_variable_numdistinct(VariableStatDat
*** 4777,4783 ****
              {
                  case ObjectIdAttributeNumber:
                  case SelfItemPointerAttributeNumber:
!                     stadistinct = -1.0; /* unique */
                      break;
                  case TableOidAttributeNumber:
                      stadistinct = 1.0;    /* only 1 value */
--- 4780,4786 ----
              {
                  case ObjectIdAttributeNumber:
                  case SelfItemPointerAttributeNumber:
!                     stadistinct = -1.0; /* unique (and all non null) */
                      break;
                  case TableOidAttributeNumber:
                      stadistinct = 1.0;    /* only 1 value */
*************** get_variable_numdistinct(VariableStatDat
*** 4799,4808 ****
       * If there is a unique index or DISTINCT clause for the variable, assume
       * it is unique no matter what pg_statistic says; the statistics could be
       * out of date, or we might have found a partial unique index that proves
!      * the var is unique for this query.
       */
      if (vardata->isunique)
!         stadistinct = -1.0;

      /*
       * If we had an absolute estimate, use that.
--- 4802,4812 ----
       * If there is a unique index or DISTINCT clause for the variable, assume
       * it is unique no matter what pg_statistic says; the statistics could be
       * out of date, or we might have found a partial unique index that proves
!      * the var is unique for this query.  However, we'd better still believe
!      * the null-fraction statistic.
       */
      if (vardata->isunique)
!         stadistinct = -1.0 * (1.0 - stanullfrac);

      /*
       * If we had an absolute estimate, use that.

pgsql-hackers by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: Heap WARM Tuples - Design Draft
Next
From: Peter Geoghegan
Date:
Subject: Re: Possible duplicate release of buffer lock.