From 933c89c5a2f42bb53c5663bc46f25473bf2a97ba Mon Sep 17 00:00:00 2001 From: Ilia Evdokimov Date: Wed, 14 Jan 2026 13:13:21 +0300 Subject: [PATCH v2] Use hash-based matching for MCVs in ScalarArrayOpExpr selectivity When estimating selectivity for ScalarArrayOpExpr (IN / ANY / ALL) with available MCV statistics, the planner currently matches IN-list elements against the MCV array using nested loops. For large IN-lists and/or MCV lists this results in O(N*M) planning-time behavior. This patch introduces a hash-based matching strategy, analogous to what is already used for MCV matching in join selectivity estimation. The implementation covers both ScalarArrayOpExpr forms: 1. Const array, where IN-list elements are available as Datums, and 2. ArrayExpr, where elements are Nodes and may include non-constant expressions. In both cases, when MCV statistics and suitable hash functions are available, the algorithm chooses the smaller side to hash: either the MCV list or the distinct constant elements of the IN-list. The other side is then scanned once, reducing the complexity from O(N*M) to O(N+M). For the ArrayExpr case, only constant IN-list elements participate in hashing and MCV matching; non-constant elements are handled separately using var_eq_non_const(), preserving the existing semantics and probability model. When the hash table is built over the IN-list, duplicate constant values are eliminated during insertion. This prevents double counting of MCV frequencies and produces a more robust disjoint-probability estimate for = ANY and <> ALL. The hash-based path is used only for equality and inequality operators that use eqsel()/neqsel(), when MCV statistics are available and suitable hash functions exist for the operator. --- src/backend/utils/adt/selfuncs.c | 1060 +++++++++++++++++++++++++++++- src/tools/pgindent/typedefs.list | 3 + 2 files changed, 1062 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 29fec655593..161d3de530e 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -146,7 +146,7 @@ /* * In production builds, switch to hash-based MCV matching when the lists are * large enough to amortize hash setup cost. (This threshold is compared to - * the sum of the lengths of the two MCV lists. This is simplistic but seems + * the sum of the lengths of the two lists. This is simplistic but seems * to work well enough.) In debug builds, we use a smaller threshold so that * the regression tests cover both paths well. */ @@ -156,6 +156,12 @@ #define EQJOINSEL_MCV_HASH_THRESHOLD 20 #endif +#ifndef USE_ASSERT_CHECKING +#define SCALARARRAY_MCV_HASH_THRESHOLD 200 +#else +#define SCALARARRAY_MCV_HASH_THRESHOLD 20 +#endif + /* Entries in the simplehash hash table used by eqjoinsel_find_matches */ typedef struct MCVHashEntry { @@ -176,14 +182,44 @@ typedef struct MCVHashContext int16 hash_typlen; /* typlen of hashed data type */ } MCVHashContext; +/* Entries in the simplehash hash table used by scalararray_mcv_hash_match */ +typedef struct MCVInHashEntry +{ + Datum value; /* the value represented by this entry */ + int index; /* its index in the relevant AttStatsSlot */ + uint32 hash; /* hash code for the Datum */ + char status; /* status code used by simplehash.h */ +} MCVInHashEntry; + +/* private_data for the simplehash hash table */ +typedef struct MCVInHashContext +{ + FunctionCallInfo equal_fcinfo; /* the equality join operator */ + FunctionCallInfo hash_fcinfo; /* the hash function to use */ + bool insert_mode; /* doing inserts or lookups? */ + bool hash_typbyval; /* typbyval of hashed data type */ + int16 hash_typlen; /* typlen of hashed data type */ +} MCVInHashContext; + /* forward reference */ typedef struct MCVHashTable_hash MCVHashTable_hash; +typedef struct MCVInHashTable_hash MCVInHashTable_hash; /* Hooks for plugins to get control when we ask for stats */ get_relation_stats_hook_type get_relation_stats_hook = NULL; get_index_stats_hook_type get_index_stats_hook = NULL; static double eqsel_internal(PG_FUNCTION_ARGS, bool negate); +static double scalararray_mcv_hash_match_expr(VariableStatData *vardata, Oid operator, Oid collation, + Node *other_op, bool var_on_left, ArrayExpr *arrayexpr, + Oid nominal_element_type, bool useOr, bool isEquality, + bool isInequality); +static double scalararray_mcv_hash_match_const(VariableStatData *vardata, Oid operator, Oid collation, + Datum *elem_values, bool *elem_nulls, int num_elems, + Oid nominal_element_type, bool useOr, bool isEquality, + bool isInequality); +static uint32 hash_mcv_in(MCVInHashTable_hash *tab, Datum key); +static bool mcvs_in_equal(MCVInHashTable_hash *tab, Datum key0, Datum key1); static double eqjoinsel_inner(FmgrInfo *eqproc, Oid collation, Oid hashLeft, Oid hashRight, VariableStatData *vardata1, VariableStatData *vardata2, @@ -287,6 +323,19 @@ static double btcost_correlation(IndexOptInfo *index, #define SH_DECLARE #include "lib/simplehash.h" +#define SH_PREFIX MCVInHashTable +#define SH_ELEMENT_TYPE MCVInHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY value +#define SH_HASH_KEY(tab,key) hash_mcv_in(tab, key) +#define SH_EQUAL(tab,key0,key1) mcvs_in_equal(tab, key0, key1) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tab,ent) (ent)->hash +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + /* * eqsel - Selectivity of "=" for any data types. @@ -2025,6 +2074,35 @@ scalararraysel(PlannerInfo *root, elmlen, elmbyval, elmalign, &elem_values, &elem_nulls, &num_elems); + /* + * Try to calculate selectivity by hash-search O(N) instead of O(N^2) + * in case of MCV matching. We use hash-search only for eqsel() and + * neqsel(). + */ + if ((isEquality || isInequality) && !is_join_clause) + { + VariableStatData vardata; + Node *other_op = NULL; + bool var_on_left; + + /* + * If expression is not variable = something or something = + * variable, then punt and return a default estimate. + */ + if (get_restriction_variable(root, clause->args, varRelid, + &vardata, &other_op, &var_on_left)) + { + s1 = scalararray_mcv_hash_match_const(&vardata, operator, clause->inputcollid, + elem_values, elem_nulls, num_elems, + nominal_element_type, useOr, isEquality, isInequality); + + ReleaseVariableStats(vardata); + + if (s1 >= 0.0) + return s1; + } + } + /* * For generic operators, we assume the probability of success is * independent for each array element. But for "= ANY" or "<> ALL", @@ -2100,6 +2178,35 @@ scalararraysel(PlannerInfo *root, get_typlenbyval(arrayexpr->element_typeid, &elmlen, &elmbyval); + /* + * Try to calculate selectivity by hash-search O(N) instead of O(N^2) + * in case of MCV matching. We use hash-search only for eqsel() and + * neqsel(). + */ + if ((isEquality || isInequality) && !is_join_clause) + { + VariableStatData vardata; + Node *other_op = NULL; + bool var_on_left; + + /* + * If expression is not variable = something or something = + * variable, then punt and return a default estimate. + */ + if (get_restriction_variable(root, clause->args, varRelid, + &vardata, &other_op, &var_on_left)) + { + s1 = scalararray_mcv_hash_match_expr(&vardata, operator, clause->inputcollid, + other_op, var_on_left, arrayexpr, + nominal_element_type, useOr, isEquality, isInequality); + + ReleaseVariableStats(vardata); + + if (s1 >= 0.0) + return s1; + } + } + /* * We use the assumption of disjoint probabilities here too, although * the odds of equal array elements are rather higher if the elements @@ -2210,6 +2317,957 @@ scalararraysel(PlannerInfo *root, return s1; } +/* + * Estimate selectivity of a ScalarArrayOpExpr (ANY/ALL) using MCV statistics. + * + * This function implements the same probability model as the standard + * ScalarArrayOpExpr estimation code (independent or disjoint probabilities + * for OR/AND), but uses MCV statistics and hashing to compute + * per-element probabilities efficiently when possible. + * + * MCVs are used only to obtain more accurate values; the combination + * of frequencies follows the standard ANY/ALL formulas. + * + * Inputs: + * vardata: statistics and metadata for the variable being estimated + * operator: equality or inequality operator to apply + * collation: OID of collation to use + * other_op: expression for the non-variable side of the comparison + * var_on_left: true if the variable is on the left side of the operator + * arrayexpr: array of IN-list with expressions + * nominal_element_type: type of IN-list elements + * useOr: true if elements are combined using OR semantics, false for AND + * isEquality: true if the operator is equality + * isInequality: true if the operator is inequality + * + * Result: + * Returns a selectivity estimate in the range [0.0, 1.0], or -1.0 if the + * selectivity cannot be reliably estimated by this function. + * + * Note: this function assumes that the operator’s selectivity behavior + * matches eqsel()/neqsel semantics (equality or inequality). + * It must not be used for operators with custom or non-standard + * selectivity functions. + */ +static double +scalararray_mcv_hash_match_expr(VariableStatData *vardata, Oid operator, Oid collation, + Node *other_op, bool var_on_left, ArrayExpr *arrayexpr, + Oid nominal_element_type, bool useOr, bool isEquality, + bool isInequality) +{ + Form_pg_statistic stats; + AttStatsSlot sslot; + FmgrInfo eqproc; + double selec = -1.0, + s1disjoint, + nullfrac = 0.0; + Oid hashLeft = InvalidOid, + hashRight = InvalidOid, + opfuncoid; + bool have_mcvs = false; + int num_elems = 0; + ListCell *l; + + /* + * If the variable is known to be unique, MCV statistics do not represent + * a meaningful frequency distribution, so skip MCV-based estimation. + */ + if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) + return -1.0; + + /* + * For inequality (<>, ALL), we compute probabilities using the negated + * equality operator and later transform them as + * + * p(x <> c) = 1 - p(x = c) - nullfrac + */ + if (isInequality) + { + operator = get_negator(operator); + if (!OidIsValid(operator)) + return -1.0; + } + + opfuncoid = get_opcode(operator); + memset(&sslot, 0, sizeof(sslot)); + + if (HeapTupleIsValid(vardata->statsTuple)) + { + if (statistic_proc_security_check(vardata, opfuncoid)) + have_mcvs = get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (have_mcvs) + { + fmgr_info(opfuncoid, &eqproc); + + num_elems = list_length(arrayexpr->elements); + + /* + * f the MCV list and IN-list are large enough, and the operator + * supports hashing, attempt to use hash functions so that MCV–IN + * matching can be done in O(N+M) instead of O(N×M). + */ + if (sslot.nvalues + num_elems >= SCALARARRAY_MCV_HASH_THRESHOLD) + (void) get_op_hash_functions(operator, &hashLeft, &hashRight); + } + + if (have_mcvs && OidIsValid(hashLeft) && OidIsValid(hashRight)) + { + /* Use a hash table to speed up the matching */ + LOCAL_FCINFO(fcinfo, 2); + LOCAL_FCINFO(hash_fcinfo, 1); + MCVInHashTable_hash *hashTable; + FmgrInfo hash_proc; + MCVInHashContext hashContext; + double sumallcommon = 0.0, + remaining_selec = 0.0; + bool isdefault; + double otherdistinct; + + /* Grab the nullfrac for use below. */ + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + + selec = s1disjoint = (useOr ? 0.0 : 1.0); + + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + if (sslot.nvalues <= num_elems) + { + /* + * Build a hash table over the MCV values. + * + * For each IN-list element, we compute p(x = element): constants + * are matched against MCVs (or the residual frequency), while + * non-constant elements fall back to the operator's generic + * selectivity estimator. These probabilities are then combined + * using the standard ANY/ALL formulas. + */ + fmgr_info(hashLeft, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.insert_mode = true; + + get_typlenbyval(sslot.valuetype, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + hashTable = MCVInHashTable_create(CurrentMemoryContext, + sslot.nvalues, + &hashContext); + + for (int i = 0; i < sslot.nvalues; i++) + { + bool found = false; + MCVInHashEntry *entry; + + entry = MCVInHashTable_insert(hashTable, sslot.values[i], &found); + + /* + * MCVHashTable_insert will only report "found" if the new + * value is equal to some previous one per datum_image_eq(). + * That probably shouldn't happen, since we're not expecting + * duplicates in the MCV list. If we do find a dup, just + * ignore it, leaving the hash entry's index pointing at the + * first occurrence. That matches the behavior that the + * non-hashed code path would have. + */ + if (likely(!found)) + entry->index = i; + + sumallcommon += sslot.numbers[i]; + } + + /* + * Prepare to probe the hash table. If the probe values are of a + * different data type, then we need to change hash functions. + * (This code relies on the assumption that since we defined + * SH_STORE_HASH, simplehash.h will never need to compute hash + * values for existing hash table entries.) + */ + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(hashRight, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* + * remaining_selec is the total probability mass of all non-MCV + * values (i.e., 1 - sum(MCV frequencies) - nullfrac). This mass + * is later divided among the remaining distinct values to + * estimate p(x = c) for constants not present in the MCV list. + */ + remaining_selec = 1.0 - sumallcommon - nullfrac; + CLAMP_PROBABILITY(remaining_selec); + + /* + * and in fact it's probably a good deal less. We approximate that + * all the not-common values share this remaining fraction + * equally, so we divide by the number of other distinct values. + */ + otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers; + if (otherdistinct > 1) + remaining_selec /= otherdistinct; + + /* + * Another cross-check: selectivity shouldn't be estimated as more + * than the least common "most common value". + */ + if (sslot.nnumbers > 0 && remaining_selec > sslot.numbers[sslot.nnumbers - 1]) + remaining_selec = sslot.numbers[sslot.nnumbers - 1]; + + /* Evaluate selectivity contribution of each IN-list element. */ + foreach(l, arrayexpr->elements) + { + MCVInHashEntry *entry; + Selectivity s1; + Node *elem_value = (Node *) lfirst(l); + + if (IsA(elem_value, Const)) + { + /* + * If the constant is NULL, assume operator is strict and + * return zero, ie, operator will never return TRUE. + * (It's zero even for a negator op.) + */ + if (((Const *) elem_value)->constisnull) + continue; + + entry = MCVInHashTable_lookup(hashTable, ((Const *) elem_value)->constvalue); + + if (entry != NULL) + { + /* + * As in the other code path, skip already-matched + * hash entries + */ + s1 = sslot.numbers[entry->index]; + } + else + { + s1 = remaining_selec; + } + + if (isInequality) + s1 = 1.0 - s1 - nullfrac; + } + else + { + s1 = var_eq_non_const(vardata, operator, collation, other_op, + var_on_left, isInequality); + } + + CLAMP_PROBABILITY(s1); + + if (useOr) + { + selec = selec + s1 - selec * s1; + if (isEquality) + s1disjoint += s1; + } + else + { + selec = selec * s1; + if (isInequality) + s1disjoint += s1 - 1.0; + } + } + + MCVInHashTable_destroy(hashTable); + + /* + * For = ANY or <> ALL, if the IN-list elements are assumed + * distinct, the events are disjoint and the total probability is + * the sum of individual probabilities. Use that estimate if it + * lies in [0,1]. + */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + selec = s1disjoint; + } + else + { + /* + * Here the IN-list is shorter than the MCV list. + * + * We build a hash table over the IN-list values so that we can + * quickly determine which MCVs are referenced by the query. + * + * We then: - sum probabilities of MCVs that appear in the IN-list + * - count how many IN-list values are not MCVs - estimate their + * probabilities from the remaining frequency mass - combine + * everything using the standard ANY/ALL probability rules + */ + int distinct_in = 0; + int mcv_matches = 0; + int non_mcv_in = 0; + double mcv_sum = 0.0; + Selectivity disjoint_sel = 0.0; + + fmgr_info(hashRight, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.insert_mode = true; + + get_typlenbyval(nominal_element_type, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + /* + * Build a hash table over the distinct constant values from the + * IN-list. Duplicates are ignored because ANY/ALL semantics + * assume the elements are distinct for disjoint-probability + * estimation. + */ + hashTable = MCVInHashTable_create(CurrentMemoryContext, + num_elems, + &hashContext); + + /* + * Insert all constant IN-list elements into the hash table, + * keeping only one copy of each distinct value. + */ + foreach(l, arrayexpr->elements) + { + bool found = false; + Node *elem_value = (Node *) lfirst(l); + + if (!IsA(elem_value, Const)) + continue; + + if (((Const *) elem_value)->constisnull) + continue; + + MCVInHashTable_insert(hashTable, ((Const *) elem_value)->constvalue, &found); + + if (likely(!found)) + distinct_in++; + } + + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(hashLeft, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* + * Scan the MCV list once. + * + * For each MCV value: - always add its frequency to sumallcommon + * (needed to compute the remaining non-MCV probability mass) - if + * the MCV appears in the IN-list, record it as a match and add + * its probability to mcv_sum + * + * At the same time we also fold its contribution into the running + * ANY/ALL probability (selec), exactly as the generic code does. + */ + for (int i = 0; i < sslot.nvalues; i++) + { + MCVInHashEntry *entry; + + sumallcommon += sslot.numbers[i]; + + entry = MCVInHashTable_lookup(hashTable, sslot.values[i]); + if (entry != NULL) + { + Selectivity s1 = sslot.numbers[i]; + + mcv_sum += s1; + mcv_matches++; + + if (isInequality) + s1 = 1.0 - s1 - nullfrac; + + CLAMP_PROBABILITY(s1); + + if (useOr) + selec = selec + s1 - selec * s1; + else + selec = selec * s1; + } + } + + /* + * Compute the total probability mass of all non-MCV values. This + * is the part of the column distribution not covered by MCVs. + */ + remaining_selec = 1.0 - sumallcommon - nullfrac; + CLAMP_PROBABILITY(remaining_selec); + + /* + * Approximate the per-value probability of a non-MCV constant by + * dividing the remaining probability mass by the number of other + * distinct values. + */ + otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers; + if (otherdistinct > 1) + remaining_selec /= otherdistinct; + + if (sslot.nnumbers > 0 && remaining_selec > sslot.numbers[sslot.nnumbers - 1]) + remaining_selec = sslot.numbers[sslot.nnumbers - 1]; + + /* + * Number of IN-list values that are not MCVs. Each of these is + * assumed to have probability = remaining_selec. + */ + non_mcv_in = distinct_in - mcv_matches; + Assert(non_mcv_in >= 0); + + /* + * Compute the disjoint-probability estimate: + * + * s = sum of probabilities of all MCV values present in the + * IN-list + sum of probabilities of all non-MCV IN-list values + * + * This is valid for = ANY and <> ALL when the IN-list elements + * are assumed distinct. + */ + disjoint_sel = mcv_sum + non_mcv_in * remaining_selec; + + if (isInequality) + disjoint_sel = 1.0 - disjoint_sel - nullfrac; + + if ((useOr ? isEquality : isInequality) && + disjoint_sel >= 0.0 && disjoint_sel <= 1.0) + selec = disjoint_sel; + + /* + * These elements cannot be matched against MCVs and therefore + * must always be handled by the generic estimator. Their + * probabilities are combined into the same ANY/ALL probability + * chain to preserve the correct probability model. + */ + foreach(l, arrayexpr->elements) + { + Selectivity s1 = 0.0; + Node *elem_value = (Node *) lfirst(l); + + if (IsA(elem_value, Const)) + continue; + + s1 = var_eq_non_const(vardata, operator, collation, + other_op, var_on_left, isInequality); + + if (isInequality) + s1 = 1.0 - s1 - nullfrac; + + CLAMP_PROBABILITY(s1); + + if (useOr) + selec = selec + s1 - selec * s1; + else + selec = selec * s1; + } + + MCVInHashTable_destroy(hashTable); + } + + CLAMP_PROBABILITY(selec); + } + + free_attstatsslot(&sslot); + + return selec; +} + +/* + * Estimate selectivity of a ScalarArrayOpExpr (ANY/ALL) using MCV statistics. + * + * This function implements the same probability model as the standard + * ScalarArrayOpExpr estimation code (independent or disjoint probabilities + * for OR/AND), but uses MCV statistics and hashing to compute + * per-element probabilities efficiently when possible. + * + * MCVs are used only to obtain more accurate values; the combination + * of frequencies follows the standard ANY/ALL formulas. + * + * Inputs: + * vardata: statistics and metadata for the variable being estimated + * operator: equality or inequality operator to apply + * collation: OID of collation to use + * other_op: expression for the non-variable side of the comparison + * var_on_left: true if the variable is on the left side of the operator + * elem_values: array of IN-list element const values + * elem_nulls: array indicating which IN-list elements are NULL + * num_elems: number of IN-list const elements + * nominal_element_type: type of IN-list elements + * useOr: true if elements are combined using OR semantics, false for AND + * isEquality: true if the operator is equality + * isInequality: true if the operator is inequality + * + * Result: + * Returns a selectivity estimate in the range [0.0, 1.0], or -1.0 if the + * selectivity cannot be reliably estimated by this function. + * + * Note: this function assumes that the operator’s selectivity behavior + * matches eqsel()/neqsel semantics (equality or inequality). + * It must not be used for operators with custom or non-standard + * selectivity functions. + */ +static double +scalararray_mcv_hash_match_const(VariableStatData *vardata, Oid operator, Oid collation, + Datum *elem_values, bool *elem_nulls, int num_elems, + Oid nominal_element_type, bool useOr, bool isEquality, + bool isInequality) +{ + Form_pg_statistic stats; + AttStatsSlot sslot; + FmgrInfo eqproc; + double selec = -1.0, + s1disjoint, + nullfrac = 0.0; + Oid hashLeft = InvalidOid, + hashRight = InvalidOid, + opfuncoid; + bool have_mcvs = false; + + /* + * If the variable is known to be unique, MCV statistics do not represent + * a meaningful frequency distribution, so skip MCV-based estimation. + */ + if (vardata->isunique && vardata->rel && vardata->rel->tuples >= 1.0) + return -1.0; + + /* + * For inequality (<>, ALL), we compute probabilities using the negated + * equality operator and later transform them as + * + * p(x <> c) = 1 - p(x = c) - nullfrac + */ + if (isInequality) + { + operator = get_negator(operator); + if (!OidIsValid(operator)) + return -1.0; + } + + opfuncoid = get_opcode(operator); + memset(&sslot, 0, sizeof(sslot)); + + if (HeapTupleIsValid(vardata->statsTuple)) + { + if (statistic_proc_security_check(vardata, opfuncoid)) + have_mcvs = get_attstatsslot(&sslot, vardata->statsTuple, + STATISTIC_KIND_MCV, InvalidOid, + ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS); + } + + if (have_mcvs) + { + fmgr_info(opfuncoid, &eqproc); + + /* + * f the MCV list and IN-list are large enough, and the operator + * supports hashing, attempt to use hash functions so that MCV–IN + * matching can be done in O(N+M) instead of O(N×M). + */ + if (sslot.nvalues + num_elems >= SCALARARRAY_MCV_HASH_THRESHOLD) + (void) get_op_hash_functions(operator, &hashLeft, &hashRight); + } + + if (have_mcvs && OidIsValid(hashLeft) && OidIsValid(hashRight)) + { + /* Use a hash table to speed up the matching */ + LOCAL_FCINFO(fcinfo, 2); + LOCAL_FCINFO(hash_fcinfo, 1); + MCVInHashTable_hash *hashTable; + FmgrInfo hash_proc; + MCVInHashContext hashContext; + double sumallcommon = 0.0, + remaining_selec = 0.0; + bool isdefault; + double otherdistinct; + + /* Grab the nullfrac for use below. */ + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + nullfrac = stats->stanullfrac; + + selec = s1disjoint = (useOr ? 0.0 : 1.0); + + InitFunctionCallInfoData(*fcinfo, &eqproc, 2, collation, + NULL, NULL); + fcinfo->args[0].isnull = false; + fcinfo->args[1].isnull = false; + + if (sslot.nvalues <= num_elems) + { + /* + * Build a hash table over the MCV values. + * + * For each IN-list element, we compute p(x = element): constants + * are matched against MCVs (or the residual frequency). These + * probabilities are then combined using the standard ANY/ALL + * formulas. + */ + fmgr_info(hashLeft, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.insert_mode = true; + + get_typlenbyval(sslot.valuetype, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + hashTable = MCVInHashTable_create(CurrentMemoryContext, + sslot.nvalues, + &hashContext); + + for (int i = 0; i < sslot.nvalues; i++) + { + bool found = false; + MCVInHashEntry *entry; + + entry = MCVInHashTable_insert(hashTable, sslot.values[i], &found); + + /* + * MCVHashTable_insert will only report "found" if the new + * value is equal to some previous one per datum_image_eq(). + * That probably shouldn't happen, since we're not expecting + * duplicates in the MCV list. If we do find a dup, just + * ignore it, leaving the hash entry's index pointing at the + * first occurrence. That matches the behavior that the + * non-hashed code path would have. + */ + if (likely(!found)) + entry->index = i; + + sumallcommon += sslot.numbers[i]; + } + + /* + * Prepare to probe the hash table. If the probe values are of a + * different data type, then we need to change hash functions. + * (This code relies on the assumption that since we defined + * SH_STORE_HASH, simplehash.h will never need to compute hash + * values for existing hash table entries.) + */ + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(hashRight, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* + * remaining_selec is the total probability mass of all non-MCV + * values (i.e., 1 - sum(MCV frequencies) - nullfrac). This mass + * is later divided among the remaining distinct values to + * estimate p(x = c) for constants not present in the MCV list. + */ + remaining_selec = 1.0 - sumallcommon - nullfrac; + CLAMP_PROBABILITY(remaining_selec); + + /* + * and in fact it's probably a good deal less. We approximate that + * all the not-common values share this remaining fraction + * equally, so we divide by the number of other distinct values. + */ + otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers; + if (otherdistinct > 1) + remaining_selec /= otherdistinct; + + /* + * Another cross-check: selectivity shouldn't be estimated as more + * than the least common "most common value". + */ + if (sslot.nnumbers > 0 && remaining_selec > sslot.numbers[sslot.nnumbers - 1]) + remaining_selec = sslot.numbers[sslot.nnumbers - 1]; + + /* Evaluate selectivity contribution of each IN-list element. */ + for (int i = 0; i < num_elems; i++) + { + MCVInHashEntry *entry; + Selectivity s1; + + /* + * If the constant is NULL, assume operator is strict and + * return zero, ie, operator will never return TRUE. (It's + * zero even for a negator op.) + */ + if (elem_nulls[i]) + continue; + + entry = MCVInHashTable_lookup(hashTable, elem_values[i]); + + if (entry != NULL) + { + /* + * As in the other code path, skip already-matched hash + * entries + */ + s1 = sslot.numbers[entry->index]; + } + else + s1 = remaining_selec; + + if (isInequality) + s1 = 1.0 - s1 - nullfrac; + + CLAMP_PROBABILITY(s1); + + if (useOr) + { + selec = selec + s1 - selec * s1; + if (isEquality) + s1disjoint += s1; + } + else + { + selec = selec * s1; + if (isInequality) + s1disjoint += s1 - 1.0; + } + } + + MCVInHashTable_destroy(hashTable); + + /* + * For = ANY or <> ALL, if the IN-list elements are assumed + * distinct, the events are disjoint and the total probability is + * the sum of individual probabilities. Use that estimate if it + * lies in [0,1]. + */ + if ((useOr ? isEquality : isInequality) && + s1disjoint >= 0.0 && s1disjoint <= 1.0) + selec = s1disjoint; + } + else + { + /* + * Here the IN-list is shorter than the MCV list. + * + * We build a hash table over the IN-list values so that we can + * quickly determine which MCVs are referenced by the query. + * + * We then: - sum probabilities of MCVs that appear in the IN-list + * - count how many IN-list values are not MCVs - estimate their + * probabilities from the remaining frequency mass - combine + * everything using the standard ANY/ALL probability rules + */ + int distinct_in = 0; + int mcv_matches = 0; + int non_mcv_in = 0; + double mcv_sum = 0.0; + Selectivity disjoint_sel = 0.0; + + fmgr_info(hashRight, &hash_proc); + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + + hashContext.equal_fcinfo = fcinfo; + hashContext.hash_fcinfo = hash_fcinfo; + hashContext.insert_mode = true; + + get_typlenbyval(nominal_element_type, + &hashContext.hash_typlen, + &hashContext.hash_typbyval); + + /* + * Build a hash table over the distinct constant values from the + * IN-list. Duplicates are ignored because ANY/ALL semantics + * assume the elements are distinct for disjoint-probability + * estimation. + */ + hashTable = MCVInHashTable_create(CurrentMemoryContext, + num_elems, + &hashContext); + + /* + * Insert all constant IN-list elements into the hash table, + * keeping only one copy of each distinct value. + */ + for (int i = 0; i < num_elems; i++) + { + bool found = false; + + if (elem_nulls[i]) + continue; + + MCVInHashTable_insert(hashTable, elem_values[i], &found); + + if (likely(!found)) + distinct_in++; + } + + hashContext.insert_mode = false; + if (hashLeft != hashRight) + { + fmgr_info(hashLeft, &hash_proc); + /* Resetting hash_fcinfo is probably unnecessary, but be safe */ + InitFunctionCallInfoData(*hash_fcinfo, &hash_proc, 1, collation, + NULL, NULL); + hash_fcinfo->args[0].isnull = false; + } + + /* + * Scan the MCV list once. + * + * For each MCV value: - always add its frequency to sumallcommon + * (needed to compute the remaining non-MCV probability mass) - if + * the MCV appears in the IN-list, record it as a match and add + * its probability to mcv_sum + * + * At the same time we also fold its contribution into the running + * ANY/ALL probability (selec), exactly as the generic code does. + */ + for (int i = 0; i < sslot.nvalues; i++) + { + MCVInHashEntry *entry; + + sumallcommon += sslot.numbers[i]; + + entry = MCVInHashTable_lookup(hashTable, sslot.values[i]); + if (entry != NULL) + { + Selectivity s1 = sslot.numbers[i]; + + mcv_sum += s1; + mcv_matches++; + + if (isInequality) + s1 = 1.0 - s1 - nullfrac; + + CLAMP_PROBABILITY(s1); + + if (useOr) + selec = selec + s1 - selec * s1; + else + selec = selec * s1; + } + } + + /* + * Compute the total probability mass of all non-MCV values. This + * is the part of the column distribution not covered by MCVs. + */ + remaining_selec = 1.0 - sumallcommon - nullfrac; + CLAMP_PROBABILITY(remaining_selec); + + /* + * Approximate the per-value probability of a non-MCV constant by + * dividing the remaining probability mass by the number of other + * distinct values. + */ + otherdistinct = get_variable_numdistinct(vardata, &isdefault) - sslot.nnumbers; + if (otherdistinct > 1) + remaining_selec /= otherdistinct; + + if (sslot.nnumbers > 0 && remaining_selec > sslot.numbers[sslot.nnumbers - 1]) + remaining_selec = sslot.numbers[sslot.nnumbers - 1]; + + /* + * Number of IN-list values that are not MCVs. Each of these is + * assumed to have probability = remaining_selec. + */ + non_mcv_in = distinct_in - mcv_matches; + Assert(non_mcv_in >= 0); + + /* + * Compute the disjoint-probability estimate: + * + * s = sum of probabilities of all MCV values present in the + * IN-list + sum of probabilities of all non-MCV IN-list values + * + * This is valid for = ANY and <> ALL when the IN-list elements + * are assumed distinct. + */ + disjoint_sel = mcv_sum + non_mcv_in * remaining_selec; + + if (isInequality) + disjoint_sel = 1.0 - disjoint_sel - nullfrac; + + if ((useOr ? isEquality : isInequality) && + disjoint_sel >= 0.0 && disjoint_sel <= 1.0) + selec = disjoint_sel; + + MCVInHashTable_destroy(hashTable); + } + + CLAMP_PROBABILITY(selec); + } + + free_attstatsslot(&sslot); + + return selec; +} + +/* + * Support functions for the hash tables used by eqjoinsel_find_matches + */ +static uint32 +hash_mcv_in(MCVInHashTable_hash *tab, Datum key) +{ + MCVInHashContext *context = (MCVInHashContext *) tab->private_data; + FunctionCallInfo fcinfo = context->hash_fcinfo; + Datum fresult; + + fcinfo->args[0].value = key; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + Assert(!fcinfo->isnull); + return DatumGetUInt32(fresult); +} + +static bool +mcvs_in_equal(MCVInHashTable_hash *tab, Datum key0, Datum key1) +{ + MCVInHashContext *context = (MCVInHashContext *) tab->private_data; + + if (context->insert_mode) + { + /* + * During the insertion step, any comparisons will be between two + * Datums of the hash table's data type, so if the given operator is + * cross-type it will be the wrong thing to use. Fortunately, we can + * use datum_image_eq instead. The MCV values should all be distinct + * anyway, so it's mostly pro-forma to compare them at all. + */ + return datum_image_eq(key0, key1, + context->hash_typbyval, context->hash_typlen); + } + else + { + FunctionCallInfo fcinfo = context->equal_fcinfo; + Datum fresult; + + fcinfo->args[0].value = key0; + fcinfo->args[1].value = key1; + fcinfo->isnull = false; + fresult = FunctionCallInvoke(fcinfo); + return (!fcinfo->isnull && DatumGetBool(fresult)); + } +} + /* * Estimate number of elements in the array yielded by an expression. * diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 14dec2d49c1..adfe8f4c588 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1667,6 +1667,9 @@ MBuf MCVHashContext MCVHashEntry MCVHashTable_hash +MCVInHashContext +MCVInHashEntry +MCVInHashTable_hash MCVItem MCVList MEMORY_BASIC_INFORMATION -- 2.34.1