Re: Clamping reulst row number of joins. - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Clamping reulst row number of joins. |
Date | |
Msg-id | 22598.1425686096@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Clamping reulst row number of joins. (Tom Lane <tgl@sss.pgh.pa.us>) |
List | pgsql-hackers |
I wrote: > Hm, the problem evidently is that we get a default selectivity estimate > for the ON condition. I think a proper fix for this would involve > teaching eqjoinsel (and ideally other join selectivity functions) how > to drill down into appendrels and combine estimates for the child > relations. I wrote a prototype patch for this. The additions to examine_variable() seem pretty reasonable. However, the only selectivity function I've fixed is eqjoinsel_inner(). If we do it like this, we're going to need similar recursive-boilerplate additions in basically every selectivity function, which seems like a PITA as well as a lot of code bloat. Can anyone think of a cute way to minimize that overhead? regards, tom lane diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 4dd3f9f..aaf76be 100644 *** a/src/backend/utils/adt/selfuncs.c --- b/src/backend/utils/adt/selfuncs.c *************** static double convert_one_bytea_to_scala *** 181,187 **** int rangelo, int rangehi); static char *convert_string_datum(Datum value, Oid typid); static double convert_timevalue_to_scalar(Datum value, Oid typid); ! static void examine_simple_variable(PlannerInfo *root, Var *var, VariableStatData *vardata); static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Datum *min, Datum *max); --- 181,187 ---- int rangelo, int rangehi); static char *convert_string_datum(Datum value, Oid typid); static double convert_timevalue_to_scalar(Datum value, Oid typid); ! static void examine_simple_variable(PlannerInfo *root, Var *var, Index varno, VariableStatData *vardata); static bool get_variable_range(PlannerInfo *root, VariableStatData *vardata, Oid sortop, Datum *min, Datum *max); *************** eqjoinsel_inner(Oid operator, *** 2221,2226 **** --- 2221,2276 ---- float4 *numbers2 = NULL; int nnumbers2 = 0; + if (vardata1->children) + { + /* Recurse to appendrel children and compute weighted selectivity */ + double appendrelrows; + ListCell *lc; + + selec = 0; + appendrelrows = 0; + foreach(lc, vardata1->children) + { + VariableStatData *childdata = (VariableStatData *) lfirst(lc); + double cselec; + + if (childdata->rel == NULL) + continue; /* safety check */ + cselec = eqjoinsel_inner(operator, childdata, vardata2); + selec += cselec * childdata->rel->rows; + appendrelrows += childdata->rel->rows; + } + if (appendrelrows > 0) + selec /= appendrelrows; + CLAMP_PROBABILITY(selec); + return selec; + } + + if (vardata2->children) + { + /* Recurse to appendrel children and compute weighted selectivity */ + double appendrelrows; + ListCell *lc; + + selec = 0; + appendrelrows = 0; + foreach(lc, vardata2->children) + { + VariableStatData *childdata = (VariableStatData *) lfirst(lc); + double cselec; + + if (childdata->rel == NULL) + continue; /* safety check */ + cselec = eqjoinsel_inner(operator, vardata1, childdata); + selec += cselec * childdata->rel->rows; + appendrelrows += childdata->rel->rows; + } + if (appendrelrows > 0) + selec /= appendrelrows; + CLAMP_PROBABILITY(selec); + return selec; + } + nd1 = get_variable_numdistinct(vardata1, &isdefault1); nd2 = get_variable_numdistinct(vardata2, &isdefault2); *************** get_restriction_variable(PlannerInfo *ro *** 4192,4198 **** { *varonleft = true; *other = estimate_expression_value(root, rdata.var); ! /* Assume we need no ReleaseVariableStats(rdata) here */ return true; } --- 4242,4248 ---- { *varonleft = true; *other = estimate_expression_value(root, rdata.var); ! ReleaseVariableStats(rdata); /* usually unnecessary, but ... */ return true; } *************** get_restriction_variable(PlannerInfo *ro *** 4200,4206 **** { *varonleft = false; *other = estimate_expression_value(root, vardata->var); ! /* Assume we need no ReleaseVariableStats(*vardata) here */ *vardata = rdata; return true; } --- 4250,4256 ---- { *varonleft = false; *other = estimate_expression_value(root, vardata->var); ! ReleaseVariableStats(*vardata); *vardata = rdata; return true; } *************** get_join_variables(PlannerInfo *root, Li *** 4259,4283 **** * node: the expression tree to examine * varRelid: see specs for restriction selectivity functions * ! * Outputs: *vardata is filled as follows: ! * var: the input expression (with any binary relabeling stripped, if ! * it is or contains a variable; but otherwise the type is preserved) ! * rel: RelOptInfo for relation containing variable; NULL if expression ! * contains no Vars (NOTE this could point to a RelOptInfo of a ! * subquery, not one in the current query). ! * statsTuple: the pg_statistic entry for the variable, if one exists; ! * otherwise NULL. ! * freefunc: pointer to a function to release statsTuple with. ! * vartype: exposed type of the expression; this should always match ! * the declared input type of the operator we are estimating for. ! * atttype, atttypmod: type data to pass to get_attstatsslot(). This is ! * commonly the same as the exposed type of the variable argument, ! * but can be different in binary-compatible-type cases. ! * isunique: TRUE if we were able to match the var to a unique index or a ! * single-column DISTINCT clause, implying its values are unique for ! * this query. (Caution: this should be trusted for statistical ! * purposes only, since we do not check indimmediate nor verify that ! * the exact same definition of equality applies.) * * Caller is responsible for doing ReleaseVariableStats() before exiting. */ --- 4309,4316 ---- * node: the expression tree to examine * varRelid: see specs for restriction selectivity functions * ! * Outputs: ! * *vardata is filled as per the specs in selfuncs.h. * * Caller is responsible for doing ReleaseVariableStats() before exiting. */ *************** examine_variable(PlannerInfo *root, Node *** 4317,4323 **** vardata->isunique = has_unique_index(vardata->rel, var->varattno); /* Try to locate some stats */ ! examine_simple_variable(root, var, vardata); return; } --- 4350,4356 ---- vardata->isunique = has_unique_index(vardata->rel, var->varattno); /* Try to locate some stats */ ! examine_simple_variable(root, var, var->varno, vardata); return; } *************** examine_variable(PlannerInfo *root, Node *** 4464,4478 **** * Handle a simple Var for examine_variable * * This is split out as a subroutine so that we can recurse to deal with ! * Vars referencing subqueries. * * We already filled in all the fields of *vardata except for the stats tuple. */ static void ! examine_simple_variable(PlannerInfo *root, Var *var, VariableStatData *vardata) { ! RangeTblEntry *rte = root->simple_rte_array[var->varno]; Assert(IsA(rte, RangeTblEntry)); --- 4497,4512 ---- * Handle a simple Var for examine_variable * * This is split out as a subroutine so that we can recurse to deal with ! * Vars referencing subqueries. "varno" is the RTE index to assume that ! * the Var refers to; other fields of the Var can be taken at face value. * * We already filled in all the fields of *vardata except for the stats tuple. */ static void ! examine_simple_variable(PlannerInfo *root, Var *var, Index varno, VariableStatData *vardata) { ! RangeTblEntry *rte = root->simple_rte_array[varno]; Assert(IsA(rte, RangeTblEntry)); *************** examine_simple_variable(PlannerInfo *roo *** 4499,4505 **** BoolGetDatum(rte->inh)); vardata->freefunc = ReleaseSysCache; } ! else if (rte->rtekind == RTE_SUBQUERY && !rte->inh) { /* * Plain subquery (not one that was converted to an appendrel). --- 4533,4584 ---- BoolGetDatum(rte->inh)); vardata->freefunc = ReleaseSysCache; } ! else if (rte->rtekind == RTE_SUBQUERY && rte->inh) ! { ! /* ! * Parent of a UNION ALL appendrel; examine all children in hopes of ! * acquiring useful stats. ! */ ! ListCell *lc; ! ! foreach(lc, root->append_rel_list) ! { ! AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); ! VariableStatData *childdata; ! ! /* append_rel_list contains all append rels; ignore others */ ! if (appinfo->parent_relid != varno) ! continue; ! ! /* Initialize childdata with same contents as parent... */ ! childdata = (VariableStatData *) palloc(sizeof(VariableStatData)); ! memcpy(childdata, vardata, sizeof(VariableStatData)); ! ! /* ... then update rel and reset what mustn't be shared */ ! childdata->rel = find_base_rel(root, appinfo->child_relid); ! childdata->statsTuple = NULL; ! childdata->freefunc = NULL; ! ! childdata->children = NIL; ! ! /* Recursively look for stats for this child rel */ ! examine_simple_variable(root, var, appinfo->child_relid, childdata); ! ! /* If we have nested appendrels, flatten them */ ! if (childdata->children) ! { ! vardata->children = list_concat(vardata->children, ! childdata->children); ! /* be sure to clean up the intermediate appendrel's entry */ ! childdata->children = NIL; ! ReleaseVariableStatsP(childdata); ! pfree(childdata); ! } ! else ! vardata->children = lappend(vardata->children, childdata); ! } ! } ! else if (rte->rtekind == RTE_SUBQUERY) { /* * Plain subquery (not one that was converted to an appendrel). *************** examine_simple_variable(PlannerInfo *roo *** 4533,4539 **** * can't use that rel pointer either, but have to look up the Var's * rel afresh. */ ! rel = find_base_rel(root, var->varno); /* If the subquery hasn't been planned yet, we have to punt */ if (rel->subroot == NULL) --- 4612,4618 ---- * can't use that rel pointer either, but have to look up the Var's * rel afresh. */ ! rel = find_base_rel(root, varno); /* If the subquery hasn't been planned yet, we have to punt */ if (rel->subroot == NULL) *************** examine_simple_variable(PlannerInfo *roo *** 4600,4606 **** * if the underlying column is unique, the subquery may have * joined to other tables in a way that creates duplicates. */ ! examine_simple_variable(rel->subroot, var, vardata); } } else --- 4679,4685 ---- * if the underlying column is unique, the subquery may have * joined to other tables in a way that creates duplicates. */ ! examine_simple_variable(rel->subroot, var, var->varno, vardata); } } else *************** examine_simple_variable(PlannerInfo *roo *** 4615,4620 **** --- 4694,4727 ---- } /* + * ReleaseVariableStatsP + * Release resources represented by a VariableStatData. + * + * For what are now somewhat historical reasons, this is typically invoked + * via the macro ReleaseVariableStats(). The VariableStatData struct itself + * is *not* freed here, since it is often a local variable in the caller. + */ + void + ReleaseVariableStatsP(VariableStatData *vardata) + { + ListCell *lc; + + /* Release any pin we may have on a stats tuple */ + if (HeapTupleIsValid(vardata->statsTuple)) + (*vardata->freefunc) (vardata->statsTuple); + /* Recurse to handle any appendrel child nodes */ + foreach(lc, vardata->children) + { + VariableStatData *childdata = (VariableStatData *) lfirst(lc); + + ReleaseVariableStatsP(childdata); + pfree(childdata); + } + /* Might as well free the list cells too */ + list_free(vardata->children); + } + + /* * get_variable_numdistinct * Estimate the number of distinct values of a variable. * diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index bf69f2a..266b4f2 100644 *** a/src/include/utils/selfuncs.h --- b/src/include/utils/selfuncs.h *************** *** 63,69 **** } while (0) ! /* Return data from examine_variable and friends */ typedef struct VariableStatData { Node *var; /* the Var or expression tree */ --- 63,96 ---- } while (0) ! /* ! * Return data from examine_variable and friends ! * ! * var: the input expression (with any binary relabeling stripped, if ! * it is or contains a variable; but otherwise the type is preserved). ! * rel: RelOptInfo for relation containing variable; NULL if expression ! * contains no Vars (NOTE this could point to a RelOptInfo of a ! * subquery, not one in the current query). ! * statsTuple: the pg_statistic entry for the variable, if one exists; ! * otherwise NULL. ! * freefunc: pointer to a function to release statsTuple with (this will ! * be called by ReleaseVariableStats(); callers don't do so directly). ! * vartype: exposed type of the expression; this should always match ! * the declared input type of the operator we are estimating for. ! * atttype, atttypmod: type data to pass to get_attstatsslot(). This is ! * commonly the same as the exposed type of the variable argument, ! * but can be different in binary-compatible-type cases. ! * isunique: TRUE if we were able to match the var to a unique index or a ! * single-column DISTINCT clause, implying its values are unique for ! * this query. (Caution: this should be trusted for statistical ! * purposes only, since we do not check indimmediate nor verify that ! * the exact same definition of equality applies.) ! * children: if rel is an appendrel for which we could not get stats ! * directly, children is a List of VariableStatData structs for the ! * appendrel's child rels, which may have useful statistics. Note that ! * the "var" fields will all point to the parent's "var"; they are not ! * transposed to match the child relid. ! */ typedef struct VariableStatData { Node *var; /* the Var or expression tree */ *************** typedef struct VariableStatData *** 75,87 **** Oid atttype; /* type to pass to get_attstatsslot */ int32 atttypmod; /* typmod to pass to get_attstatsslot */ bool isunique; /* matches unique index or DISTINCT clause */ } VariableStatData; ! #define ReleaseVariableStats(vardata) \ ! do { \ ! if (HeapTupleIsValid((vardata).statsTuple)) \ ! (* (vardata).freefunc) ((vardata).statsTuple); \ ! } while(0) typedef enum --- 102,111 ---- Oid atttype; /* type to pass to get_attstatsslot */ int32 atttypmod; /* typmod to pass to get_attstatsslot */ bool isunique; /* matches unique index or DISTINCT clause */ + List *children; /* List of child VariableStatData's, or NIL */ } VariableStatData; ! #define ReleaseVariableStats(vardata) ReleaseVariableStatsP(&(vardata)) typedef enum *************** extern PGDLLIMPORT get_index_stats_hook_ *** 111,116 **** --- 135,141 ---- extern void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata); + extern void ReleaseVariableStatsP(VariableStatData *vardata); extern bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other,
pgsql-hackers by date: