diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c new file mode 100644 index 37a735b..7d6e678 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -44,6 +44,12 @@ static void addRangeClause(RangeQueryCla bool varonleft, bool isLTsel, Selectivity s2); static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root, List *clauses); +static Selectivity clauselist_selectivity_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats); /**************************************************************************** * ROUTINES TO COMPUTE SELECTIVITIES @@ -61,64 +67,8 @@ static RelOptInfo *find_single_rel_for_c * * The basic approach is to apply extended statistics first, on as many * clauses as possible, in order to capture cross-column dependencies etc. - * The remaining clauses are then estimated using regular statistics tracked - * for individual columns. This is done by simply passing the clauses to - * clauselist_selectivity_simple. - */ -Selectivity -clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo) -{ - Selectivity s1 = 1.0; - RelOptInfo *rel; - Bitmapset *estimatedclauses = NULL; - - /* - * Determine if these clauses reference a single relation. If so, and if - * it has extended statistics, try to apply those. - */ - rel = find_single_rel_for_clauses(root, clauses); - if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) - { - /* - * Estimate as many clauses as possible using extended statistics. - * - * 'estimatedclauses' tracks the 0-based list position index of - * clauses that we've estimated using extended statistics, and that - * should be ignored. - */ - s1 *= statext_clauselist_selectivity(root, clauses, varRelid, - jointype, sjinfo, rel, - &estimatedclauses); - } - - /* - * Apply normal selectivity estimates for the remaining clauses, passing - * 'estimatedclauses' so that it skips already estimated ones. - */ - return s1 * clauselist_selectivity_simple(root, clauses, varRelid, - jointype, sjinfo, - estimatedclauses); -} - -/* - * clauselist_selectivity_simple - - * Compute the selectivity of an implicitly-ANDed list of boolean - * expression clauses. The list can be empty, in which case 1.0 - * must be returned. List elements may be either RestrictInfos - * or bare expression clauses --- the former is preferred since - * it allows caching of results. The estimatedclauses bitmap tracks - * clauses that have already been estimated by other means. - * - * See clause_selectivity() for the meaning of the additional parameters. - * - * Our basic approach is to take the product of the selectivities of the - * subclauses. However, that's only right if the subclauses have independent - * probabilities, and in reality they are often NOT independent. So, - * we want to be smarter where we can. + * The remaining clauses are then estimated by taking the product of their + * selectivities. * * We also recognize "range queries", such as "x > 34 AND x < 42". Clauses * are recognized as possible range query components if they are restriction @@ -147,28 +97,63 @@ clauselist_selectivity(PlannerInfo *root * selectivity functions; perhaps some day we can generalize the approach. */ Selectivity -clauselist_selectivity_simple(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - Bitmapset *estimatedclauses) +clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + return clauselist_selectivity_ext(root, clauses, varRelid, + jointype, sjinfo, true); +} + +Selectivity +clauselist_selectivity_ext(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats) { Selectivity s1 = 1.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; RangeQueryClause *rqlist = NULL; ListCell *l; int listidx; /* - * If there's exactly one clause (and it was not estimated yet), just go - * directly to clause_selectivity(). None of what we might do below is - * relevant. + * If there's exactly one clause, just go directly to + * clause_selectivity(). None of what we might do below is relevant. */ - if (list_length(clauses) == 1 && bms_is_empty(estimatedclauses)) - return clause_selectivity(root, (Node *) linitial(clauses), - varRelid, jointype, sjinfo); + if (list_length(clauses) == 1) + return clause_selectivity_ext(root, (Node *) linitial(clauses), + varRelid, jointype, sjinfo, + use_extended_stats); + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (use_extended_stats && rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { + /* + * Estimate as many clauses as possible using extended statistics. + * + * 'estimatedclauses' tracks the 0-based list position index of + * clauses that we've estimated using extended statistics, and that + * should be ignored. + */ + s1 *= statext_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses, false); + } /* + * Apply normal selectivity estimates for remaining clauses. We'll be + * careful to skip any clauses which were already estimated above. + * * Anything that doesn't look like a potential rangequery clause gets * multiplied into s1 and forgotten. Anything that does gets inserted into * an rqlist entry. @@ -190,7 +175,8 @@ clauselist_selectivity_simple(PlannerInf continue; /* Always compute the selectivity using clause_selectivity */ - s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); + s2 = clause_selectivity_ext(root, clause, varRelid, jointype, sjinfo, + use_extended_stats); /* * Check for being passed a RestrictInfo. @@ -351,6 +337,87 @@ clauselist_selectivity_simple(PlannerInf } /* + * clauselist_selectivity_or - + * Compute the selectivity of an implicitly-ORed list of boolean + * expression clauses. The list can be empty, in which case 0.0 + * must be returned. List elements may be either RestrictInfos + * or bare expression clauses --- the former is preferred since + * it allows caching of results. + * + * See clause_selectivity() for the meaning of the additional parameters. + * + * The basic approach is to apply extended statistics first, on as many + * clauses as possible, in order to capture cross-column dependencies etc. + * The remaining clauses are then estimated as if they were independent. + */ +static Selectivity +clauselist_selectivity_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats) +{ + Selectivity s1 = 0.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; + ListCell *lc; + int listidx; + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (use_extended_stats && rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { + /* + * Estimate as many clauses as possible using extended statistics. + * + * 'estimatedclauses' tracks the 0-based list position index of + * clauses that we've estimated using extended statistics, and that + * should be ignored. + * + * XXX We can't multiply with current value, because for OR clauses we + * start with 0.0, so we simply assign to 's' directly. + */ + s1 = statext_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses, true); + } + + /* + * Estimate the remaining clauses. + * + * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to account + * for the probable overlap of selected tuple sets. + * + * XXX is this too conservative? + */ + listidx = -1; + foreach(lc, clauses) + { + Selectivity s2; + + listidx++; + + /* + * Skip this clause if it's already been estimated by some other + * statistics above. + */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + s2 = clause_selectivity_ext(root, (Node *) lfirst(lc), varRelid, + jointype, sjinfo, use_extended_stats); + + s1 = s1 + s2 - s1 * s2; + } + + return s1; +} + +/* * addRangeClause --- add a new range clause for clauselist_selectivity * * Here is where we try to match up pairs of range-query clauses @@ -602,6 +669,18 @@ clause_selectivity(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo) { + return clause_selectivity_ext(root, clause, varRelid, + jointype, sjinfo, true); +} + +Selectivity +clause_selectivity_ext(PlannerInfo *root, + Node *clause, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats) +{ Selectivity s1 = 0.5; /* default for any unhandled clause type */ RestrictInfo *rinfo = NULL; bool cacheable = false; @@ -716,42 +795,35 @@ clause_selectivity(PlannerInfo *root, else if (is_notclause(clause)) { /* inverse of the selectivity of the underlying clause */ - s1 = 1.0 - clause_selectivity(root, - (Node *) get_notclausearg((Expr *) clause), - varRelid, - jointype, - sjinfo); + s1 = 1.0 - clause_selectivity_ext(root, + (Node *) get_notclausearg((Expr *) clause), + varRelid, + jointype, + sjinfo, + use_extended_stats); } else if (is_andclause(clause)) { /* share code with clauselist_selectivity() */ - s1 = clauselist_selectivity(root, - ((BoolExpr *) clause)->args, - varRelid, - jointype, - sjinfo); + s1 = clauselist_selectivity_ext(root, + ((BoolExpr *) clause)->args, + varRelid, + jointype, + sjinfo, + use_extended_stats); } else if (is_orclause(clause)) { /* - * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to - * account for the probable overlap of selected tuple sets. - * - * XXX is this too conservative? + * Almost the same thing as clauselist_selectivity, but with the + * clauses connected by OR. */ - ListCell *arg; - - s1 = 0.0; - foreach(arg, ((BoolExpr *) clause)->args) - { - Selectivity s2 = clause_selectivity(root, - (Node *) lfirst(arg), - varRelid, - jointype, - sjinfo); - - s1 = s1 + s2 - s1 * s2; - } + s1 = clauselist_selectivity_or(root, + ((BoolExpr *) clause)->args, + varRelid, + jointype, + sjinfo, + use_extended_stats); } else if (is_opclause(clause) || IsA(clause, DistinctExpr)) { @@ -852,20 +924,22 @@ clause_selectivity(PlannerInfo *root, else if (IsA(clause, RelabelType)) { /* Not sure this case is needed, but it can't hurt */ - s1 = clause_selectivity(root, - (Node *) ((RelabelType *) clause)->arg, - varRelid, - jointype, - sjinfo); + s1 = clause_selectivity_ext(root, + (Node *) ((RelabelType *) clause)->arg, + varRelid, + jointype, + sjinfo, + use_extended_stats); } else if (IsA(clause, CoerceToDomain)) { /* Not sure this case is needed, but it can't hurt */ - s1 = clause_selectivity(root, - (Node *) ((CoerceToDomain *) clause)->arg, - varRelid, - jointype, - sjinfo); + s1 = clause_selectivity_ext(root, + (Node *) ((CoerceToDomain *) clause)->arg, + varRelid, + jointype, + sjinfo, + use_extended_stats); } else { diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c new file mode 100644 index d950b4e..b1abcde --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -1073,8 +1073,8 @@ clauselist_apply_dependencies(PlannerInf } } - simple_sel = clauselist_selectivity_simple(root, attr_clauses, varRelid, - jointype, sjinfo, NULL); + simple_sel = clauselist_selectivity_ext(root, attr_clauses, varRelid, + jointype, sjinfo, false); attr_sel[attidx++] = simple_sel; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c new file mode 100644 index 3632692..3941657 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1282,16 +1282,17 @@ statext_is_compatible_clause(PlannerInfo static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { ListCell *l; Bitmapset **list_attnums; int listidx; - Selectivity sel = 1.0; + Selectivity sel = (is_or) ? 0.0 : 1.0; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) - return 1.0; + return sel; list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); @@ -1371,40 +1372,111 @@ statext_mcv_clauselist_selectivity(Plann listidx++; } - /* - * First compute "simple" selectivity, i.e. without the extended - * statistics, and essentially assuming independence of the - * columns/clauses. We'll then use the various selectivities computed - * from MCV list to improve it. - */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); + if (is_or) + { + bool *or_matches = NULL; + MCVList *mcv_list; - /* - * Now compute the multi-column estimate from the MCV list, along with - * the other selectivities (base & total selectivity). - */ - mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, - jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); + /* Load the MCV list stored in the statistics object */ + mcv_list = statext_mcv_load(stat->statOid); - /* Estimated selectivity of values not covered by MCV matches */ - other_sel = simple_sel - mcv_basesel; - CLAMP_PROBABILITY(other_sel); + /* + * Compute the selectivity of the OR'ed list of clauses by + * estimating each in turn and combining them using the formula + * P(A OR B) = P(A) + P(B) - P(A AND B). This allows us to use + * the multivariate MCV stats to better estimate P(A AND B). + * + * Each time we iterate this formula, the clause "A" above is + * equal to all the clauses processed so far, combined with "OR". + * This is then combined using "AND" with the next clause, as + * described in mcv_clause_selectivity_or(). + */ + foreach(l, stat_clauses) + { + Node *clause = (Node *) lfirst(l); + Selectivity clause_sel; - /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ - if (other_sel > 1.0 - mcv_totalsel) - other_sel = 1.0 - mcv_totalsel; + /* Compute P(B) --- the selectivity of the next clause */ + clause_sel = clause_selectivity_ext(root, clause, varRelid, + jointype, sjinfo, false); - /* - * Overall selectivity is the combination of MCV and non-MCV - * estimates. - */ - stat_sel = mcv_sel + other_sel; - CLAMP_PROBABILITY(stat_sel); + /* Initial estimate for P(A AND B), assuming independence */ + simple_sel = sel * clause_sel; - /* Factor the estimate from this MCV to the overall estimate. */ - sel *= stat_sel; + /* + * Now compute the multi-column estimate of this from the MCV + * list, along with the base and total selectivities. + */ + mcv_sel = mcv_clause_selectivity_or(root, stat, mcv_list, + clause, &or_matches, + &mcv_basesel, + &mcv_totalsel); + + /* + * Estimated selectivity of (A AND B) values not covered by + * MCV matches. + */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); + + /* + * The non-MCV selectivity can't exceed the 1 - mcv_totalsel. + */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; + + /* + * Overall selectivity P(A AND B) is the combination of MCV + * and non-MCV estimates. + */ + stat_sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(stat_sel); + + /* Factor this into the overall result */ + sel = sel + clause_sel - stat_sel; + CLAMP_PROBABILITY(sel); + } + } + else + { + /* + * We have an implicitly AND'ed list of clauses. + * + * First compute the "simple" selectivity, i.e. without any + * extended statistics, and essentially assuming independence of + * the columns/clauses. + */ + simple_sel = clauselist_selectivity_ext(root, stat_clauses, + varRelid, jointype, + sjinfo, false); + + /* + * Now compute the multi-column estimate from the MCV list, along + * with the other selectivities (base & total selectivity). + */ + mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, + varRelid, jointype, sjinfo, + rel, &mcv_basesel, + &mcv_totalsel); + + /* Estimated selectivity of values not covered by MCV matches */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); + + /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; + + /* + * Overall selectivity is the combination of MCV and non-MCV + * estimates. + */ + stat_sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(stat_sel); + + /* Factor this into the overall result */ + sel *= stat_sel; + } } return sel; @@ -1417,13 +1489,21 @@ statext_mcv_clauselist_selectivity(Plann Selectivity statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { Selectivity sel; /* First, try estimating clauses using a multivariate MCV list. */ sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype, - sjinfo, rel, estimatedclauses); + sjinfo, rel, estimatedclauses, is_or); + + /* + * Functional dependencies only work for clauses connected by AND, so for + * OR clauses we're done. + */ + if (is_or) + return sel; /* * Then, apply functional dependencies on the remaining clauses by calling diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c new file mode 100644 index 6a262f1..3913538 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -1936,3 +1936,76 @@ mcv_clauselist_selectivity(PlannerInfo * return s; } + + +/* + * mcv_clause_selectivity_or + * Return a correction to the selectivity estimate for a clause that is + * included in an OR'ed list of clauses. + * + * This is called for each clause in the OR'ed list of clauses, using the + * following algorithm: + * + * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity + * of the first n clauses in the list. Then the combined selectivity taking + * into account the next clause C[n+1] can be written as + * + * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1]) + * + * To estimate the final "correction" term above, we track the match bitmap + * for the OR'ed list of clauses examined so far and examine its intersection + * with the match bitmap for the (n+1)'th clause. + * + * The return value is the sum of MCV item frequencies for the match bitmap + * intersection corresponding to the correction term above. We also return + * the total selectivity of all the MCV items (not just the matching ones), + * and the sum of base frequencies computed on the assumption of independence, + * in the same way as mcv_clauselist_selectivity(). This allows the + * correction term above to be estimated using both per-column statistics and + * multivariate MCV statistics, in the same way as we do for an implicitly + * AND'ed list of clauses. + * + * The parameter "or_matches" is an in/out parameter tracking the match bitmap + * for the clauses examined so far. The caller is expected to set it to NULL + * the first time it calls this function. + */ +Selectivity +mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat, + MCVList *mcv, Node *clause, bool **or_matches, + Selectivity *basesel, Selectivity *totalsel) +{ + bool *new_matches = NULL; + int i; + Selectivity s = 0.0; + + /* build the OR-matches bitmap, if not built already */ + if (*or_matches == NULL) + *or_matches = palloc0(sizeof(bool) * mcv->nitems); + + /* build the match bitmap for the new clause */ + new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys, + mcv, false); + + /* + * Sum the frequencies for all the matching MCV items in the intersection + * of the two match bitmaps and update or_matches as described above. + */ + *basesel = 0.0; + *totalsel = 0.0; + for (i = 0; i < mcv->nitems; i++) + { + *totalsel += mcv->items[i].frequency; + + if ((*or_matches)[i] && new_matches[i]) + { + *basesel += mcv->items[i].base_frequency; + s += mcv->items[i].frequency; + } + + (*or_matches)[i] = (*or_matches)[i] || new_matches[i]; + } + + pfree(new_matches); + + return s; +} diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h new file mode 100644 index 3e41710..dea0e73 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -58,17 +58,23 @@ extern Selectivity clause_selectivity(Pl int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); -extern Selectivity clauselist_selectivity_simple(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - Bitmapset *estimatedclauses); +extern Selectivity clause_selectivity_ext(PlannerInfo *root, + Node *clause, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats); extern Selectivity clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); +extern Selectivity clauselist_selectivity_ext(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + bool use_extended_stats); /* in path/costsize.c: */ diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h new file mode 100644 index 61e6969..242e470 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -109,4 +109,12 @@ extern Selectivity mcv_clauselist_select Selectivity *basesel, Selectivity *totalsel); +extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root, + StatisticExtInfo *stat, + MCVList *mcv, + Node *clause, + bool **or_matches, + Selectivity *basesel, + Selectivity *totalsel); + #endif /* EXTENDED_STATS_INTERNAL_H */ diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h new file mode 100644 index 50fce49..c9ed211 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -116,7 +116,8 @@ extern Selectivity statext_clauselist_se JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Bitmapset **estimatedclauses); + Bitmapset **estimatedclauses, + bool is_or); extern bool has_stats_of_kind(List *stats, char requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind, Bitmapset **clause_attnums, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out new file mode 100644 index 4c3edd2..07b6ac5 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -1113,6 +1113,12 @@ SELECT * FROM check_estimated_rows('SELE 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); estimated | actual -----------+-------- @@ -1173,13 +1179,6 @@ SELECT * FROM check_estimated_rows('SELE 100 | 100 (1 row) --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) -SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); - estimated | actual ------------+-------- - 343 | 200 -(1 row) - -- check change of unrelated column type does not reset the MCV statistics ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64); SELECT d.stxdmcv IS NOT NULL @@ -1477,6 +1476,110 @@ SELECT * FROM check_estimated_rows('SELE 1 | 0 (1 row) +-- mcv covering just a small fraction of data +CREATE TABLE mcv_lists_partial ( + a INT, + b INT, + c INT +); +-- 10 frequent groups, each with 100 elements +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + mod(i,10), + mod(i,10), + mod(i,10) + FROM generate_series(0,999) s(i); +-- 100 groups that will make it to the MCV list (includes the 10 frequent ones) +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + i, + i, + i + FROM generate_series(0,99) s(i); +-- 4000 groups in total, most of which won't make it (just a single item) +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + i, + i, + i + FROM generate_series(0,3999) s(i); +ANALYZE mcv_lists_partial; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 1 | 102 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 300 | 102 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10'); + estimated | actual +-----------+-------- + 1 | 2 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10'); + estimated | actual +-----------+-------- + 6 | 2 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10'); + estimated | actual +-----------+-------- + 204 | 104 +(1 row) + +CREATE STATISTICS mcv_lists_partial_stats (mcv) ON a, b, c + FROM mcv_lists_partial; +ANALYZE mcv_lists_partial; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 102 | 102 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 98 | 102 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10'); + estimated | actual +-----------+-------- + 2 | 2 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10'); + estimated | actual +-----------+-------- + 2 | 2 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10'); + estimated | actual +-----------+-------- + 1 | 0 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10'); + estimated | actual +-----------+-------- + 102 | 104 +(1 row) + +DROP TABLE mcv_lists_partial; -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, @@ -1506,12 +1609,36 @@ SELECT * FROM check_estimated_rows('SELE 102 | 714 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); estimated | actual -----------+-------- 4 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); + estimated | actual +-----------+-------- + 298 | 1572 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); + estimated | actual +-----------+-------- + 2649 | 1572 +(1 row) + -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; @@ -1528,12 +1655,36 @@ SELECT * FROM check_estimated_rows('SELE 714 | 714 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); estimated | actual -----------+-------- 143 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); + estimated | actual +-----------+-------- + 1714 | 1572 +(1 row) + DROP TABLE mcv_lists_multi; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql new file mode 100644 index 9781e59..3ec6dda --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -561,6 +561,8 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)'); @@ -581,9 +583,6 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY[4, 5]) AND b IN (''1'', ''2'', NULL, ''3'') AND c > ANY (ARRAY[1, 2, NULL, 3])'); --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) -SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); - -- check change of unrelated column type does not reset the MCV statistics ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64); @@ -777,6 +776,70 @@ SELECT * FROM check_estimated_rows('SELE SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c'); +-- mcv covering just a small fraction of data +CREATE TABLE mcv_lists_partial ( + a INT, + b INT, + c INT +); + +-- 10 frequent groups, each with 100 elements +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + mod(i,10), + mod(i,10), + mod(i,10) + FROM generate_series(0,999) s(i); + +-- 100 groups that will make it to the MCV list (includes the 10 frequent ones) +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + i, + i, + i + FROM generate_series(0,99) s(i); + +-- 4000 groups in total, most of which won't make it (just a single item) +INSERT INTO mcv_lists_partial (a, b, c) + SELECT + i, + i, + i + FROM generate_series(0,3999) s(i); + +ANALYZE mcv_lists_partial; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10'); + +CREATE STATISTICS mcv_lists_partial_stats (mcv) ON a, b, c + FROM mcv_lists_partial; + +ANALYZE mcv_lists_partial; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 0'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 0'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 AND b = 10 AND c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 10 OR b = 10 OR c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 AND b = 0 AND c = 10'); + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE a = 0 OR b = 0 OR c = 10'); + +DROP TABLE mcv_lists_partial; + -- check the ability to use multiple MCV lists CREATE TABLE mcv_lists_multi ( a INTEGER, @@ -799,7 +862,11 @@ ANALYZE mcv_lists_multi; -- estimates without any mcv statistics SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; @@ -809,7 +876,11 @@ ANALYZE mcv_lists_multi; SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); DROP TABLE mcv_lists_multi;