Re: [HACKERS] WIP patch: distinguish selectivity of < from <= and > from >= - Mailing list pgsql-hackers

From Tom Lane
Subject Re: [HACKERS] WIP patch: distinguish selectivity of < from <= and > from >=
Date
Msg-id 16232.1499440369@sss.pgh.pa.us
Whole thread Raw
In response to Re: [HACKERS] WIP patch: distinguish selectivity of < from <= and > from >=  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
I wrote:
> I looked at that again and realized that one of the answers I was missing
> lies in the behavior of analyze.c's compute_scalar_stats().

I updated the patch's comments based on this.  I'll just attach the
selfuncs.c part of the patch, since nothing else changed.

            regards, tom lane

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index e103f5e..3ec629b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -163,7 +163,7 @@ static double var_eq_non_const(VariableStatData *vardata, Oid operator,
                  bool varonleft, bool negate);
 static double ineq_histogram_selectivity(PlannerInfo *root,
                            VariableStatData *vardata,
-                           FmgrInfo *opproc, bool isgt,
+                           FmgrInfo *opproc, bool isgt, bool iseq,
                            Datum constval, Oid consttype);
 static double eqjoinsel_inner(Oid operator,
                 VariableStatData *vardata1, VariableStatData *vardata2);
@@ -544,18 +544,21 @@ neqsel(PG_FUNCTION_ARGS)
 /*
  *    scalarineqsel        - Selectivity of "<", "<=", ">", ">=" for scalars.
  *
- * This is the guts of both scalarltsel and scalargtsel.  The caller has
- * commuted the clause, if necessary, so that we can treat the variable as
- * being on the left.  The caller must also make sure that the other side
- * of the clause is a non-null Const, and dissect same into a value and
- * datatype.
+ * This is the guts of scalarltsel/scalarlesel/scalargtsel/scalargesel.
+ * The isgt and iseq flags distinguish which of the four cases apply.
+ *
+ * The caller has commuted the clause, if necessary, so that we can treat
+ * the variable as being on the left.  The caller must also make sure that
+ * the other side of the clause is a non-null Const, and dissect that into
+ * a value and datatype.  (This definition simplifies some callers that
+ * want to estimate against a constructed value instead of a Const node.)
  *
  * This routine works for any datatype (or pair of datatypes) known to
  * convert_to_scalar().  If it is applied to some other datatype,
  * it will return a default estimate.
  */
 static double
-scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
+scalarineqsel(PlannerInfo *root, Oid operator, bool isgt, bool iseq,
               VariableStatData *vardata, Datum constval, Oid consttype)
 {
     Form_pg_statistic stats;
@@ -587,7 +590,8 @@ scalarineqsel(PlannerInfo *root, Oid operator, bool isgt,
      * If there is a histogram, determine which bin the constant falls in, and
      * compute the resulting contribution to selectivity.
      */
-    hist_selec = ineq_histogram_selectivity(root, vardata, &opproc, isgt,
+    hist_selec = ineq_histogram_selectivity(root, vardata,
+                                            &opproc, isgt, iseq,
                                             constval, consttype);

     /*
@@ -757,7 +761,8 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
  *    ineq_histogram_selectivity    - Examine the histogram for scalarineqsel
  *
  * Determine the fraction of the variable's histogram population that
- * satisfies the inequality condition, ie, VAR < CONST or VAR > CONST.
+ * satisfies the inequality condition, ie, VAR < (or <=, >, >=) CONST.
+ * The isgt and iseq flags distinguish which of the four cases apply.
  *
  * Returns -1 if there is no histogram (valid results will always be >= 0).
  *
@@ -768,7 +773,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 static double
 ineq_histogram_selectivity(PlannerInfo *root,
                            VariableStatData *vardata,
-                           FmgrInfo *opproc, bool isgt,
+                           FmgrInfo *opproc, bool isgt, bool iseq,
                            Datum constval, Oid consttype)
 {
     double        hist_selec;
@@ -795,11 +800,17 @@ ineq_histogram_selectivity(PlannerInfo *root,
         if (sslot.nvalues > 1)
         {
             /*
-             * Use binary search to find proper location, ie, the first slot
-             * at which the comparison fails.  (If the given operator isn't
-             * actually sort-compatible with the histogram, you'll get garbage
-             * results ... but probably not any more garbage-y than you would
-             * from the old linear search.)
+             * Use binary search to find the desired location, namely the
+             * right end of the histogram bin containing the comparison value,
+             * which is the leftmost entry for which the comparison operator
+             * succeeds (if isgt) or fails (if !isgt).  (If the given operator
+             * isn't actually sort-compatible with the histogram, you'll get
+             * garbage results ... but probably not any more garbage-y than
+             * you would have from the old linear search.)
+             *
+             * In this loop, we pay no attention to whether the operator iseq
+             * or not; that detail will be mopped up below.  (We cannot tell,
+             * anyway, whether the operator thinks the values are equal.)
              *
              * If the binary search accesses the first or last histogram
              * entry, we try to replace that endpoint with the true column min
@@ -864,25 +875,74 @@ ineq_histogram_selectivity(PlannerInfo *root,

             if (lobound <= 0)
             {
-                /* Constant is below lower histogram boundary. */
+                /*
+                 * Constant is below lower histogram boundary.  More
+                 * precisely, we have found that no entry in the histogram
+                 * satisfies the inequality clause (if !isgt) or they all do
+                 * (if isgt).  We estimate that that's true of the entire
+                 * table, so set histfrac to 0.0 (which we'll flip to 1.0
+                 * below, if isgt).
+                 */
                 histfrac = 0.0;
             }
             else if (lobound >= sslot.nvalues)
             {
-                /* Constant is above upper histogram boundary. */
+                /*
+                 * Inverse case: constant is above upper histogram boundary.
+                 */
                 histfrac = 1.0;
             }
             else
             {
+                /* We have values[i-1] <= constant <= values[i]. */
                 int            i = lobound;
+                double        eq_selec = 0;
                 double        val,
                             high,
                             low;
                 double        binfrac;

                 /*
-                 * We have values[i-1] <= constant <= values[i].
+                 * In the cases where we'll need it below, obtain an estimate
+                 * of the selectivity of "x = constval".  We use a calculation
+                 * similar to what var_eq_const() does for a non-MCV constant,
+                 * ie, estimate that all distinct non-MCV values occur equally
+                 * often.  But multiplication by "1.0 - sumcommon - nullfrac"
+                 * will be done by our caller, so we shouldn't do that here.
+                 * Therefore we can't try to clamp the estimate by reference
+                 * to the least common MCV; the result would be too small.
                  *
+                 * Note: since this is effectively assuming that constval
+                 * isn't an MCV, it's logically dubious if constval in fact is
+                 * one.  But we have to apply *some* correction for equality,
+                 * and anyway we cannot tell if constval is an MCV, since we
+                 * don't have a suitable equality operator at hand.
+                 */
+                if (i == 1 || isgt == iseq)
+                {
+                    double        otherdistinct;
+                    bool        isdefault;
+                    AttStatsSlot mcvslot;
+
+                    /* Get estimated number of distinct values */
+                    otherdistinct = get_variable_numdistinct(vardata,
+                                                             &isdefault);
+
+                    /* Subtract off the number of known MCVs */
+                    if (get_attstatsslot(&mcvslot, vardata->statsTuple,
+                                         STATISTIC_KIND_MCV, InvalidOid,
+                                         ATTSTATSSLOT_NUMBERS))
+                    {
+                        otherdistinct -= mcvslot.nnumbers;
+                        free_attstatsslot(&mcvslot);
+                    }
+
+                    /* If result doesn't seem sane, leave eq_selec at 0 */
+                    if (otherdistinct > 1)
+                        eq_selec = 1.0 / otherdistinct;
+                }
+
+                /*
                  * Convert the constant and the two nearest bin boundary
                  * values to a uniform comparison scale, and do a linear
                  * interpolation within this bin.
@@ -936,13 +996,54 @@ ineq_histogram_selectivity(PlannerInfo *root,
                  */
                 histfrac = (double) (i - 1) + binfrac;
                 histfrac /= (double) (sslot.nvalues - 1);
+
+                /*
+                 * At this point, histfrac is an estimate of the fraction of
+                 * the population represented by the histogram that satisfies
+                 * "x <= constval".  Somewhat remarkably, this statement is
+                 * true regardless of which operator we were doing the probes
+                 * with, so long as convert_to_scalar() delivers reasonable
+                 * results.  If the probe constant is equal to some histogram
+                 * entry, we would have considered the bin to the left of that
+                 * entry if probing with "<" or ">=", or the bin to the right
+                 * if probing with "<=" or ">"; but binfrac would have come
+                 * out as 1.0 in the first case and 0.0 in the second, leading
+                 * to the same histfrac in either case.  For probe constants
+                 * between histogram entries, we find the same bin and get the
+                 * same estimate with any operator.
+                 *
+                 * The fact that the estimate corresponds to "x <= constval"
+                 * and not "x < constval" is because of the way that ANALYZE
+                 * constructs the histogram: each entry is, effectively, the
+                 * rightmost value in its sample bucket.  So selectivity
+                 * values that are exact multiples of 1/(histogram_size-1)
+                 * should be understood as estimates including a histogram
+                 * entry plus everything to its left.
+                 *
+                 * However, that breaks down for the first histogram entry,
+                 * which necessarily is the leftmost value in its sample
+                 * bucket.  That means the first histogram bin is slightly
+                 * narrower than the rest, by an amount equal to eq_selec.
+                 * Another way to say that is that we want "x <= leftmost" to
+                 * be estimated as eq_selec not zero.  So, if we're dealing
+                 * with the first bin (i==1), rescale to make that true while
+                 * adjusting the rest of that bin linearly.
+                 */
+                if (i == 1)
+                    histfrac += eq_selec * (1.0 - binfrac);
+
+                /*
+                 * "x <= constval" is good if we want an estimate for "<=" or
+                 * ">", but if we are estimating for "<" or ">=", we now need
+                 * to decrease the estimate by eq_selec.
+                 */
+                if (isgt == iseq)
+                    histfrac -= eq_selec;
             }

             /*
-             * Now histfrac = fraction of histogram entries below the
-             * constant.
-             *
-             * Account for "<" vs ">"
+             * Now the estimate is finished for "<" and "<=" cases.  If we are
+             * estimating for ">" or ">=", flip it.
              */
             hist_selec = isgt ? (1.0 - histfrac) : histfrac;

@@ -950,16 +1051,21 @@ ineq_histogram_selectivity(PlannerInfo *root,
              * The histogram boundaries are only approximate to begin with,
              * and may well be out of date anyway.  Therefore, don't believe
              * extremely small or large selectivity estimates --- unless we
-             * got actual current endpoint values from the table.
+             * got actual current endpoint values from the table, in which
+             * case just do the usual sanity clamp.  Somewhat arbitrarily, we
+             * set the cutoff for other cases at a hundredth of the histogram
+             * resolution.
              */
             if (have_end)
                 CLAMP_PROBABILITY(hist_selec);
             else
             {
-                if (hist_selec < 0.0001)
-                    hist_selec = 0.0001;
-                else if (hist_selec > 0.9999)
-                    hist_selec = 0.9999;
+                double        cutoff = 0.01 / (double) (sslot.nvalues - 1);
+
+                if (hist_selec < cutoff)
+                    hist_selec = cutoff;
+                else if (hist_selec > 1.0 - cutoff)
+                    hist_selec = 1.0 - cutoff;
             }
         }

@@ -970,10 +1076,11 @@ ineq_histogram_selectivity(PlannerInfo *root,
 }

 /*
- *        scalarltsel        - Selectivity of "<" (also "<=") for scalars.
+ * Common wrapper function for the selectivity estimators that simply
+ * invoke scalarineqsel().
  */
-Datum
-scalarltsel(PG_FUNCTION_ARGS)
+static Datum
+scalarineqsel_wrapper(PG_FUNCTION_ARGS, bool isgt, bool iseq)
 {
     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
     Oid            operator = PG_GETARG_OID(1);
@@ -984,7 +1091,6 @@ scalarltsel(PG_FUNCTION_ARGS)
     bool        varonleft;
     Datum        constval;
     Oid            consttype;
-    bool        isgt;
     double        selec;

     /*
@@ -1019,14 +1125,8 @@ scalarltsel(PG_FUNCTION_ARGS)
     /*
      * Force the var to be on the left to simplify logic in scalarineqsel.
      */
-    if (varonleft)
+    if (!varonleft)
     {
-        /* we have var < other */
-        isgt = false;
-    }
-    else
-    {
-        /* we have other < var, commute to make var > other */
         operator = get_commutator(operator);
         if (!operator)
         {
@@ -1034,10 +1134,12 @@ scalarltsel(PG_FUNCTION_ARGS)
             ReleaseVariableStats(vardata);
             PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
         }
-        isgt = true;
+        isgt = !isgt;
     }

-    selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+    /* The rest of the work is done by scalarineqsel(). */
+    selec = scalarineqsel(root, operator, isgt, iseq,
+                          &vardata, constval, consttype);

     ReleaseVariableStats(vardata);

@@ -1045,78 +1147,39 @@ scalarltsel(PG_FUNCTION_ARGS)
 }

 /*
- *        scalargtsel        - Selectivity of ">" (also ">=") for integers.
+ *        scalarltsel        - Selectivity of "<" for scalars.
  */
 Datum
-scalargtsel(PG_FUNCTION_ARGS)
+scalarltsel(PG_FUNCTION_ARGS)
 {
-    PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
-    Oid            operator = PG_GETARG_OID(1);
-    List       *args = (List *) PG_GETARG_POINTER(2);
-    int            varRelid = PG_GETARG_INT32(3);
-    VariableStatData vardata;
-    Node       *other;
-    bool        varonleft;
-    Datum        constval;
-    Oid            consttype;
-    bool        isgt;
-    double        selec;
-
-    /*
-     * If expression is not variable op something or something op variable,
-     * then punt and return a default estimate.
-     */
-    if (!get_restriction_variable(root, args, varRelid,
-                                  &vardata, &other, &varonleft))
-        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
-
-    /*
-     * Can't do anything useful if the something is not a constant, either.
-     */
-    if (!IsA(other, Const))
-    {
-        ReleaseVariableStats(vardata);
-        PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
-    }
-
-    /*
-     * If the constant is NULL, assume operator is strict and return zero, ie,
-     * operator will never return TRUE.
-     */
-    if (((Const *) other)->constisnull)
-    {
-        ReleaseVariableStats(vardata);
-        PG_RETURN_FLOAT8(0.0);
-    }
-    constval = ((Const *) other)->constvalue;
-    consttype = ((Const *) other)->consttype;
-
-    /*
-     * Force the var to be on the left to simplify logic in scalarineqsel.
-     */
-    if (varonleft)
-    {
-        /* we have var > other */
-        isgt = true;
-    }
-    else
-    {
-        /* we have other > var, commute to make var < other */
-        operator = get_commutator(operator);
-        if (!operator)
-        {
-            /* Use default selectivity (should we raise an error instead?) */
-            ReleaseVariableStats(vardata);
-            PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
-        }
-        isgt = false;
-    }
+    return scalarineqsel_wrapper(fcinfo, false, false);
+}

-    selec = scalarineqsel(root, operator, isgt, &vardata, constval, consttype);
+/*
+ *        scalarlesel        - Selectivity of "<=" for scalars.
+ */
+Datum
+scalarlesel(PG_FUNCTION_ARGS)
+{
+    return scalarineqsel_wrapper(fcinfo, false, true);
+}

-    ReleaseVariableStats(vardata);
+/*
+ *        scalargtsel        - Selectivity of ">" for scalars.
+ */
+Datum
+scalargtsel(PG_FUNCTION_ARGS)
+{
+    return scalarineqsel_wrapper(fcinfo, true, false);
+}

-    PG_RETURN_FLOAT8((float8) selec);
+/*
+ *        scalargesel        - Selectivity of ">=" for scalars.
+ */
+Datum
+scalargesel(PG_FUNCTION_ARGS)
+{
+    return scalarineqsel_wrapper(fcinfo, true, true);
 }

 /*
@@ -2721,7 +2784,7 @@ neqjoinsel(PG_FUNCTION_ARGS)
 }

 /*
- *        scalarltjoinsel - Join selectivity of "<" and "<=" for scalars
+ *        scalarltjoinsel - Join selectivity of "<" for scalars
  */
 Datum
 scalarltjoinsel(PG_FUNCTION_ARGS)
@@ -2730,7 +2793,16 @@ scalarltjoinsel(PG_FUNCTION_ARGS)
 }

 /*
- *        scalargtjoinsel - Join selectivity of ">" and ">=" for scalars
+ *        scalarlejoinsel - Join selectivity of "<=" for scalars
+ */
+Datum
+scalarlejoinsel(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+}
+
+/*
+ *        scalargtjoinsel - Join selectivity of ">" for scalars
  */
 Datum
 scalargtjoinsel(PG_FUNCTION_ARGS)
@@ -2739,6 +2811,15 @@ scalargtjoinsel(PG_FUNCTION_ARGS)
 }

 /*
+ *        scalargejoinsel - Join selectivity of ">=" for scalars
+ */
+Datum
+scalargejoinsel(PG_FUNCTION_ARGS)
+{
+    PG_RETURN_FLOAT8(DEFAULT_INEQ_SEL);
+}
+
+/*
  * patternjoinsel        - Generic code for pattern-match join selectivity.
  */
 static double
@@ -3035,13 +3116,13 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
      * fraction that's <= the right-side maximum value.  But only believe
      * non-default estimates, else stick with our 1.0.
      */
-    selec = scalarineqsel(root, leop, isgt, &leftvar,
+    selec = scalarineqsel(root, leop, isgt, true, &leftvar,
                           rightmax, op_righttype);
     if (selec != DEFAULT_INEQ_SEL)
         *leftend = selec;

     /* And similarly for the right variable. */
-    selec = scalarineqsel(root, revleop, isgt, &rightvar,
+    selec = scalarineqsel(root, revleop, isgt, true, &rightvar,
                           leftmax, op_lefttype);
     if (selec != DEFAULT_INEQ_SEL)
         *rightend = selec;
@@ -3065,13 +3146,13 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
      * minimum value.  But only believe non-default estimates, else stick with
      * our own default.
      */
-    selec = scalarineqsel(root, ltop, isgt, &leftvar,
+    selec = scalarineqsel(root, ltop, isgt, false, &leftvar,
                           rightmin, op_righttype);
     if (selec != DEFAULT_INEQ_SEL)
         *leftstart = selec;

     /* And similarly for the right variable. */
-    selec = scalarineqsel(root, revltop, isgt, &rightvar,
+    selec = scalarineqsel(root, revltop, isgt, false, &rightvar,
                           leftmin, op_lefttype);
     if (selec != DEFAULT_INEQ_SEL)
         *rightstart = selec;
@@ -4012,8 +4093,8 @@ convert_numeric_to_scalar(Datum value, Oid typid)
     }

     /*
-     * Can't get here unless someone tries to use scalarltsel/scalargtsel on
-     * an operator with one numeric and one non-numeric operand.
+     * Can't get here unless someone tries to use scalarineqsel() on an
+     * operator with one numeric and one non-numeric operand.
      */
     elog(ERROR, "unsupported type: %u", typid);
     return 0;
@@ -4194,8 +4275,8 @@ convert_string_datum(Datum value, Oid typid)
         default:

             /*
-             * Can't get here unless someone tries to use scalarltsel on an
-             * operator with one string and one non-string operand.
+             * Can't get here unless someone tries to use scalarineqsel() on
+             * an operator with one string and one non-string operand.
              */
             elog(ERROR, "unsupported type: %u", typid);
             return NULL;
@@ -4399,8 +4480,8 @@ convert_timevalue_to_scalar(Datum value, Oid typid)
     }

     /*
-     * Can't get here unless someone tries to use scalarltsel/scalargtsel on
-     * an operator with one timevalue and one non-timevalue operand.
+     * Can't get here unless someone tries to use scalarineqsel() on an
+     * operator with one timevalue and one non-timevalue operand.
      */
     elog(ERROR, "unsupported type: %u", typid);
     return 0;
@@ -5765,7 +5846,8 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
         elog(ERROR, "no >= operator for opfamily %u", opfamily);
     fmgr_info(get_opcode(cmpopr), &opproc);

-    prefixsel = ineq_histogram_selectivity(root, vardata, &opproc, true,
+    prefixsel = ineq_histogram_selectivity(root, vardata,
+                                           &opproc, true, true,
                                            prefixcon->constvalue,
                                            prefixcon->consttype);

@@ -5791,7 +5873,8 @@ prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
     {
         Selectivity topsel;

-        topsel = ineq_histogram_selectivity(root, vardata, &opproc, false,
+        topsel = ineq_histogram_selectivity(root, vardata,
+                                            &opproc, false, false,
                                             greaterstrcon->constvalue,
                                             greaterstrcon->consttype);


-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

pgsql-hackers by date:

Previous
From: Simon Riggs
Date:
Subject: Re: [HACKERS] New partitioning - some feedback
Next
From: Robert Haas
Date:
Subject: Re: [HACKERS] New partitioning - some feedback