diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index b4edcd2..fbc7aff 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -25,8 +25,8 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \ pseudotypes.o rangetypes.o rangetypes_gist.o \ rowtypes.o regexp.o regproc.o ruleutils.o selfuncs.o \ tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ - network.o network_gist.o mac.o inet_cidr_ntop.o inet_net_pton.o \ - ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ + network.o network_gist.o network_selfuncs.o mac.o inet_cidr_ntop.o \ + inet_net_pton.o ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ascii.o quote.o pgstatfuncs.o encode.o dbsize.o genfile.o trigfuncs.o \ tsginidx.o tsgistidx.o tsquery.o tsquery_cleanup.o tsquery_gist.o \ tsquery_op.o tsquery_rewrite.o tsquery_util.o tsrank.o \ diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c new file mode 100644 index 0000000..87a7390 --- /dev/null +++ b/src/backend/utils/adt/network_selfuncs.c @@ -0,0 +1,416 @@ +/*------------------------------------------------------------------------- + * + * network_selfuncs.c + * Functions for selectivity estimation of network operators + * + * Estimates are based on null fraction, distinct value count, most common + * values, and histogram of inet, cidr datatypes. 
+ * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/network_selfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/htup_details.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_statistic.h" +#include "utils/lsyscache.h" +#include "utils/inet.h" +#include "utils/selfuncs.h" + + +/* Default selectivity constant for the inet overlap operator */ +#define DEFAULT_OVERLAP_SEL 0.01 + +/* Default selectivity constant for the other operators */ +#define DEFAULT_INCLUSION_SEL 0.005 + +/* Default selectivity for given operator */ +#define DEFAULT_SEL(operator) \ + ((operator) == OID_INET_OVERLAP_OP ? \ + DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL) + +static int inet_opr_order(Oid operator); +static Selectivity inet_hist_inclusion_selectivity(VariableStatData *vardata, + Datum constvalue, + double ndistinc, + int opr_order); +static int inet_inclusion_cmp(inet *left, inet *right, int opr_order); +static int inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_order); +static double inet_hist_match_divider(inet *hist, inet *query, int opr_order); + +/* + * Selectivity estimation for the subnet inclusion operators + */ +Datum +inetinclusionsel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Oid operator = PG_GETARG_OID(1); + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec, + max_mcv_selec, + mcv_selec, + max_hist_selec, + hist_selec; + Datum constvalue; + Form_pg_statistic stats; + FmgrInfo proc; + + /* + * If expression is not (variable op something) or (something op + * variable), then punt and return a default estimate. 
+ */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + } + + /* All of the subnet inclusion operators are strict. */ + if (((Const *) other)->constisnull) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(0.0); + } + + if (!HeapTupleIsValid(vardata.statsTuple)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + } + + constvalue = ((Const *) other)->constvalue; + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + + fmgr_info(get_opcode(operator), &proc); + mcv_selec = mcv_selectivity(&vardata, &proc, constvalue, varonleft, + &max_mcv_selec); + + max_hist_selec = 1.0 - stats->stanullfrac - max_mcv_selec; + + /* If current selectivity is good enough, just correct and return it. */ + if (max_hist_selec / max_mcv_selec < mcv_selec) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(mcv_selec / (1.0 - max_hist_selec)); + } + + hist_selec = inet_hist_inclusion_selectivity(&vardata, constvalue, + stats->stadistinct, (varonleft ? inet_opr_order(operator) : + inet_opr_order(operator) * -1)); + + /* + * If histogram selectivity is not exist but MCV selectivity exists, + * correct and return it. If they both not exist return the default. + */ + if (hist_selec < 0) + { + if (max_mcv_selec > 0) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(mcv_selec / (1.0 - max_hist_selec)); + } + + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_SEL(operator)); + } + + selec = mcv_selec + max_hist_selec * hist_selec; + + /* Result should be in range, but make sure... 
*/ + CLAMP_PROBABILITY(selec); + + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(selec); +} + +/* + * Practical comparable numbers for the subnet inclusion operators + */ +static int +inet_opr_order(Oid operator) +{ + switch (operator) + { + case OID_INET_SUP_OP: + return -2; + case OID_INET_SUPEQ_OP: + return -1; + case OID_INET_OVERLAP_OP: + return 0; + case OID_INET_SUBEQ_OP: + return 1; + case OID_INET_SUB_OP: + return 2; + } + + elog(ERROR, "unknown operator for inet inclusion selectivity"); +} + +/* + * Inet histogram inclusion selectivity estimation + * + * Calculates histogram selectivity for the subnet inclusion operators of + * the inet type. In the normal case, the return value is between 0 and 1. + * It should be corrected with MVC selectivity and null fraction. If + * the constant is less than the first element or greater than the last + * element of the histogram the return value will be 0. If the histogram + * does not available, the return value will be -1. + * + * The histogram is originally for the basic comparison operators. Only + * the common bits of the network part and the lenght of the network part + * (masklen) are appropriate for the subnet inclusion opeators. Fortunately, + * basic comparison fits in this situation. Even so, the lenght of the + * network part would not really be significant in the histogram. This would + * lead to big mistakes for data sets with uneven masklen distribution. + * To avoid this problem, comparison with the left and the right side of the + * buckets used together. + * + * Histogram bucket matches calculated in 3 forms. If the constant matches + * the both sides, bucket considered as fully matched. If the constant + * matches only the right side, bucket does not considered as matched at all. + * In that case, the ratio for only 1 value in the column added to + * the selectivity. + * + * The ratio for only 1 value, calculated with the ndistinct variable, + * if greater than 0. 
+ * Zero can be given to it if this behavior is not desired.  This ratio can
+ * be large enough that it should not be disregarded for addresses with
+ * small masklens.  See pg_statistic for more information about it.
+ *
+ * When the constant matches only the right side of the bucket, it will match
+ * the next bucket, unless the bucket is the last one.  If these buckets were
+ * considered as matched, it would lead to unfair multiple matches for some
+ * constants.
+ *
+ * The third form is to match the bucket partially.  In this case, a divider
+ * is calculated for each of the two boundaries.  If the address family of
+ * a boundary does not match the constant, or the comparison of the lengths
+ * of the network parts does not hold for the operator, the divider for that
+ * boundary is not taken into account.  If both dividers can be calculated,
+ * their harmonic mean is used.
+ *
+ * For a partial match with a bucket which has different address families
+ * on the left and right sides, only the boundary with the same address
+ * family is taken into consideration.  This can cause larger errors for
+ * such buckets if the masklens of their boundaries are also disparate.  It
+ * can only be the case for one bucket, if there are addresses with
+ * different families in the column.  It seems a better option than not
+ * considering these buckets.
+ *
+ * vardata supplies the statistics tuple that holds the histogram,
+ * constvalue is the inet constant of the clause, ndistinct is the distinct
+ * value count (ignored unless greater than 0), and opr_order is the operator
+ * code from inet_opr_order(), negated by the caller when the variable is on
+ * the right-hand side.  -1 is also returned when the histogram has no
+ * bucket to match against.
+ */
+static Selectivity
+inet_hist_inclusion_selectivity(VariableStatData *vardata, Datum constvalue,
+								double ndistinct, int opr_order)
+{
+	Datum	   *values;
+	inet	   *query,
+			   *left,
+			   *right;
+	int			nvalues,
+				left_order,
+				right_order,
+				i;
+	double		match,
+				divider,
+				bucket_divider,
+				left_divider,
+				right_divider;
+
+	if (!(HeapTupleIsValid(vardata->statsTuple) &&
+		  get_attstatsslot(vardata->statsTuple,
+						   vardata->atttype, vardata->atttypmod,
+						   STATISTIC_KIND_HISTOGRAM, InvalidOid,
+						   NULL,
+						   &values, &nvalues,
+						   NULL, NULL)))
+		return -1;
+
+	/*
+	 * The total is the number of buckets, plus the single-value ratio in
+	 * case the constant matches the first element.
+	 */
+	divider = nvalues - 1;
+	if (ndistinct > 0)
+		divider += 1.0 / ndistinct;
+
+	/*
+	 * A histogram with fewer than two values has no bucket.  Treat it as
+	 * unavailable instead of dividing by zero at the end.
+	 */
+	if (divider <= 0)
+	{
+		free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+		return -1;
+	}
+
+	query = DatumGetInetP(constvalue);
+	left = NULL;
+	left_order = -255;			/* The first value should be greater. */
+	match = 0.0;
+
+	/* Iterate over the histogram buckets.  Use i for the right side. */
+	for (i = 0; i < nvalues; i++)
+	{
+		right = DatumGetInetP(values[i]);
+		right_order = inet_inclusion_cmp(right, query, opr_order);
+
+		if (right_order == 0)
+		{
+			if (left_order == 0)
+			{
+				/* Full bucket match. */
+				match += 1.0;
+			}
+			else if (ndistinct > 0)
+			{
+				/* Only right side match. */
+				match += 1.0 / ndistinct;
+			}
+		}
+		else if (left != NULL &&
+				 ((right_order > 0 && left_order <= 0) ||
+				  (right_order < 0 && left_order >= 0)))
+		{
+			/*
+			 * The constant falls inside this bucket.  Each divider is
+			 * either -1 (not available) or a positive power of two.
+			 */
+			left_divider = inet_hist_match_divider(left, query, opr_order);
+			right_divider = inet_hist_match_divider(right, query, opr_order);
+
+			if (left_divider > 0 && right_divider > 0)
+				bucket_divider = 2.0 * (left_divider * right_divider) /
+					(left_divider + right_divider);
+			else if (left_divider > 0)
+				bucket_divider = left_divider;
+			else
+				bucket_divider = right_divider;
+
+			/* Partial bucket match, unless neither divider is available. */
+			if (bucket_divider > 0)
+				match += 1.0 / bucket_divider;
+		}
+
+		/* Shift the variables. */
+		left = right;
+		left_order = right_order;
+	}
+
+	elog(DEBUG1, "inet histogram inclusion matches: %f / %f", match, divider);
+
+	free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
+
+	Assert(match <= divider);
+
+	return match / divider;
+}
+
+/*
+ * Comparison function for the subnet inclusion operators
+ *
+ * Comparison is compatible with the basic comparison function for the inet
+ * type.  See network_cmp_internal in network.c for the original.  Basic
+ * comparison operators are implemented with the network_cmp_internal
+ * function.  It is possible to implement the subnet inclusion operators
+ * with this function.
+ *
+ * Comparison is first on the common bits of the network part, then on
+ * the length of the network part (masklen), as in the network_cmp_internal
+ * function.  Only the first part is in this function.  The second part is
+ * separated into another function for reusability.  The difference between
+ * the second part and the original network_cmp_internal is that the
+ * operator is used while comparing the lengths of the network parts.  See
+ * the second part in the inet_masklen_inclusion_cmp function below.
+ *
+ * Inputs of different address families compare by family.
+ */
+static int
+inet_inclusion_cmp(inet *left, inet *right, int opr_order)
+{
+	int			order;
+
+	if (ip_family(left) != ip_family(right))
+		return ip_family(left) - ip_family(right);
+
+	/* Compare only the bits covered by both netmasks. */
+	order = bitncmp(ip_addr(left), ip_addr(right),
+					Min(ip_bits(left), ip_bits(right)));
+	if (order != 0)
+		return order;
+
+	return inet_masklen_inclusion_cmp(left, right, opr_order);
+}
+
+/*
+ * Masklen comparison function for the subnet inclusion operators
+ *
+ * Compares the lengths of network parts of the inputs using the operator.
+ * If the comparison is okay for the operator, the return value will be 0.
+ * Otherwise the return value will be less than or greater than 0 with
+ * respect to the operator.
+ */ +static int +inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_order) +{ + if (ip_family(left) == ip_family(right)) + { + int order; + + order = ip_bits(left) - ip_bits(right); + + if ((order > 0 && opr_order >= 0) || + (order == 0 && opr_order >= -1 && opr_order <= 1) || + (order < 0 && opr_order <= 0)) + return 0; + + return opr_order; + } + + return ip_family(left) - ip_family(right); +} + +/* + * Inet histogram partial match divider calculation + * + * First, the families and the lenghts of the network parts are compared + * using the subnet inclusion operator. If they are not -1 returned which + * means divider not available. + * + * The divider is imagined as the distance between the decisive bits and + * the common bits of the addresses. The divider will be used as power of two + * as it is the natural scale for the IP network inclusion. It is + * an empirical formula and subject to change with more experiment. + */ +static double +inet_hist_match_divider(inet *hist, inet *query, int opr_order) +{ + if (inet_masklen_inclusion_cmp(hist, query, opr_order) == 0) + { + int min_bits, + decisive_bits; + + min_bits = Min(ip_bits(hist), ip_bits(query)); + + /* + * Set the decisive bits from the one which should contain the other + * according to the operator. 
+ */ + if (opr_order < 0) + decisive_bits = ip_bits(hist); + else if (opr_order > 0) + decisive_bits = ip_bits(query); + else + decisive_bits = min_bits; + + if (min_bits > 0) + decisive_bits -= bitncommon(ip_addr(hist), ip_addr(query), + min_bits); + + return pow(2, decisive_bits); + } + + return -1; +} diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index edb9401..36f1c7d 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1140,19 +1140,19 @@ DATA(insert OID = 1205 ( ">" PGNSP PGUID b f f 869 869 16 1203 1204 network DESCR("greater than"); DATA(insert OID = 1206 ( ">=" PGNSP PGUID b f f 869 869 16 1204 1203 network_ge scalargtsel scalargtjoinsel )); DESCR("greater than or equal"); -DATA(insert OID = 931 ( "<<" PGNSP PGUID b f f 869 869 16 933 0 network_sub - - )); +DATA(insert OID = 931 ( "<<" PGNSP PGUID b f f 869 869 16 933 0 network_sub inetinclusionsel - )); DESCR("is subnet"); #define OID_INET_SUB_OP 931 -DATA(insert OID = 932 ( "<<=" PGNSP PGUID b f f 869 869 16 934 0 network_subeq - - )); +DATA(insert OID = 932 ( "<<=" PGNSP PGUID b f f 869 869 16 934 0 network_subeq inetinclusionsel - )); DESCR("is subnet or equal"); #define OID_INET_SUBEQ_OP 932 -DATA(insert OID = 933 ( ">>" PGNSP PGUID b f f 869 869 16 931 0 network_sup - - )); +DATA(insert OID = 933 ( ">>" PGNSP PGUID b f f 869 869 16 931 0 network_sup inetinclusionsel - )); DESCR("is supernet"); #define OID_INET_SUP_OP 933 -DATA(insert OID = 934 ( ">>=" PGNSP PGUID b f f 869 869 16 932 0 network_supeq - - )); +DATA(insert OID = 934 ( ">>=" PGNSP PGUID b f f 869 869 16 932 0 network_supeq inetinclusionsel - )); DESCR("is supernet or equal"); #define OID_INET_SUPEQ_OP 934 -DATA(insert OID = 4050 ( "&&" PGNSP PGUID b f f 869 869 16 4050 0 network_overlap - - )); +DATA(insert OID = 4050 ( "&&" PGNSP PGUID b f f 869 869 16 4050 0 network_overlap inetinclusionsel - )); DESCR("overlaps (is subnet or supernet)"); #define 
OID_INET_OVERLAP_OP 4050 diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index ed4448c..bcc8c08 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2117,6 +2117,8 @@ DATA(insert OID = 928 ( network_subeq PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 DATA(insert OID = 929 ( network_sup PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "869 869" _null_ _null_ _null_ _null_ network_sup _null_ _null_ _null_ )); DATA(insert OID = 930 ( network_supeq PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "869 869" _null_ _null_ _null_ _null_ network_supeq _null_ _null_ _null_ )); DATA(insert OID = 4040 ( network_overlap PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "869 869" _null_ _null_ _null_ _null_ network_overlap _null_ _null_ _null_ )); +DATA(insert OID = 4041 ( inetinclusionsel PGNSP PGUID 12 1 0 0 0 f f f f t f s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ inetinclusionsel _null_ _null_ _null_ )); +DESCR("restriction selectivity of inet inclusion operators"); /* GiST support for inet and cidr */ DATA(insert OID = 4042 ( inet_gist_consistent PGNSP PGUID 12 1 0 0 0 f f f f t f i 5 0 16 "2281 869 23 26 2281" _null_ _null_ _null_ _null_ inet_gist_consistent _null_ _null_ _null_ )); diff --git a/src/include/utils/inet.h b/src/include/utils/inet.h index af30a42..8e7ba50 100644 --- a/src/include/utils/inet.h +++ b/src/include/utils/inet.h @@ -138,6 +138,11 @@ extern int bitncmp(void *l, void *r, int n); extern int bitncommon(unsigned char *l, unsigned char *r, int n); /* + * Selectivity estimation function in network_selfuncs.c + */ +extern Datum inetinclusionsel(PG_FUNCTION_ARGS); + +/* * GiST support functions in network_gist.c */ extern Datum inet_gist_consistent(PG_FUNCTION_ARGS);