From 400eadd3d1527d44966222e965aa4abdc5bd7d2b Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 17 Oct 2022 18:39:28 +0200 Subject: [PATCH 01/11] Allow index AMs to build and use custom statistics Some indexing AMs work very differently and estimating them using existing statistics is problematic, producing unreliable costing. This applies e.g. to BRIN, which relies on page ranges, not tuple pointers. This adds an optional AM procedure, allowing the opfamily to build custom statistics, store them in pg_statistic and then use them during planning. By default this is disabled, but may be enabled by setting SET enable_indexam_stats = true; Then ANALYZE will call the optional procedure for all indexes. --- src/backend/access/brin/brin.c | 1 + src/backend/access/brin/brin_minmax.c | 902 ++++++++++++++++++ src/backend/commands/analyze.c | 149 ++- src/backend/statistics/extended_stats.c | 2 + src/backend/utils/adt/selfuncs.c | 59 ++ src/backend/utils/cache/lsyscache.c | 41 + src/backend/utils/misc/guc_tables.c | 10 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/amapi.h | 2 + src/include/access/brin.h | 63 ++ src/include/access/brin_internal.h | 1 + src/include/catalog/pg_amproc.dat | 64 ++ src/include/catalog/pg_proc.dat | 4 + src/include/catalog/pg_statistic.h | 5 + src/include/commands/vacuum.h | 2 + src/include/utils/lsyscache.h | 1 + src/test/regress/expected/sysviews.out | 3 +- 17 files changed, 1304 insertions(+), 6 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index b5a5fa7b334..88a0361015c 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -95,6 +95,7 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->amstrategies = 0; amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM; amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS; + amroutine->amstatsprocnum = BRIN_PROCNUM_STATISTICS; amroutine->amcanorder = false; amroutine->amcanorderbyop = false; amroutine->amcanbackward = 
false; diff --git a/src/backend/access/brin/brin_minmax.c b/src/backend/access/brin/brin_minmax.c index 2431591be65..14fff22247a 100644 --- a/src/backend/access/brin/brin_minmax.c +++ b/src/backend/access/brin/brin_minmax.c @@ -10,17 +10,23 @@ */ #include "postgres.h" +#include "access/brin.h" #include "access/brin_internal.h" +#include "access/brin_revmap.h" #include "access/brin_tuple.h" #include "access/genam.h" #include "access/stratnum.h" #include "catalog/pg_amop.h" #include "catalog/pg_type.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" #include "utils/builtins.h" #include "utils/datum.h" #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/syscache.h" +#include "utils/timestamp.h" typedef struct MinmaxOpaque { @@ -253,6 +259,902 @@ brin_minmax_union(PG_FUNCTION_ARGS) PG_RETURN_VOID(); }
+/* FIXME copy of a private struct from brin.c */
+typedef struct BrinOpaque
+{
+	BlockNumber bo_pagesPerRange;
+	BrinRevmap *bo_rmAccess;
+	BrinDesc   *bo_bdesc;
+} BrinOpaque;
+
+/*
+ * range_minval_cmp
+ *		qsort_arg comparator ordering BrinRange pointers by min_value.
+ *
+ * The elements are (BrinRange *), the extra argument is the TypeCacheEntry
+ * providing both the comparison function and the collation. Ranges with
+ * equal min_value are ordered by blkno_start; note this never returns 0.
+ */
+static int
+range_minval_cmp(const void *a, const void *b, void *arg)
+{
+	BrinRange  *lhs = *(BrinRange **) a;
+	BrinRange  *rhs = *(BrinRange **) b;
+	TypeCacheEntry *tc = (TypeCacheEntry *) arg;
+	int			cmp;
+
+	cmp = DatumGetInt32(FunctionCall2Coll(&tc->cmp_proc_finfo,
+										  tc->typcollation,
+										  lhs->min_value,
+										  rhs->min_value));
+	if (cmp != 0)
+		return cmp;
+
+	/* equal values, fall back to the position in the table */
+	return (lhs->blkno_start < rhs->blkno_start) ? -1 : 1;
+}
+
+/*
+ * range_maxval_cmp
+ *		qsort_arg comparator ordering BrinRange pointers by max_value.
+ *
+ * Same contract as range_minval_cmp, except that max_value is compared.
+ */
+static int
+range_maxval_cmp(const void *a, const void *b, void *arg)
+{
+	BrinRange  *lhs = *(BrinRange **) a;
+	BrinRange  *rhs = *(BrinRange **) b;
+	TypeCacheEntry *tc = (TypeCacheEntry *) arg;
+	int			cmp;
+
+	cmp = DatumGetInt32(FunctionCall2Coll(&tc->cmp_proc_finfo,
+										  tc->typcollation,
+										  lhs->max_value,
+										  rhs->max_value));
+	if (cmp != 0)
+		return cmp;
+
+	/* equal values, fall back to the position in the table */
+	return (lhs->blkno_start < rhs->blkno_start) ? -1 : 1;
+}
+
+/*
+ * range_values_cmp
+ *		Compare two Datums (passed by pointer) using the comparison function
+ *		and collation from the TypeCacheEntry passed as the extra argument.
+ */
+static int
+range_values_cmp(const void *a, const void *b, void *arg)
+{
+	TypeCacheEntry *tc = (TypeCacheEntry *) arg;
+	Datum		lhs = *(Datum *) a;
+	Datum		rhs = *(Datum *) b;
+
+	return DatumGetInt32(FunctionCall2Coll(&tc->cmp_proc_finfo,
+										   tc->typcollation,
+										   lhs, rhs));
+}
+
+/*
+ * minval_end
+ *		Determine first index so that (minval > value).
+ *
+ * The array of ranges is expected to be sorted by minvalue, so this is the first
+ * range that can't possibly intersect with a range having "value" as maxval.
+ */
+static int
+minval_end(BrinRange **ranges, int nranges, Datum value, TypeCacheEntry *typcache)
+{
+	int			start = 0,
+				end = (nranges - 1);
+
+	/* no ranges at all - nothing can match (and don't touch ranges[-1]) */
+	if (nranges <= 0)
+		return 0;
+
+	/* everything matches */
+	if (range_values_cmp(&value, &ranges[end]->min_value, typcache) >= 0)
+		return nranges;
+
+	/* no matches */
+	if (range_values_cmp(&value, &ranges[start]->min_value, typcache) < 0)
+		return 0;
+
+	/*
+	 * Binary search for the first range with (minval > value). The checks
+	 * above guarantee such range exists, and that it's not the first one.
+	 */
+	while ((end - start) > 0)
+	{
+		int			midpoint;
+		int			r;
+
+		midpoint = start + (end - start) / 2;
+
+		r = range_values_cmp(&value, &ranges[midpoint]->min_value, typcache);
+
+		if (r >= 0)				/* ranges[midpoint] minval <= value */
+			start = midpoint + 1;
+		else					/* ranges[midpoint] minval > value */
+			end = midpoint;
+	}
+
+	Assert(range_values_cmp(&ranges[start]->min_value, &value, typcache) > 0);
+	Assert(range_values_cmp(&ranges[start-1]->min_value, &value, typcache) <= 0);
+
+	return start;
+}
+
+
+/*
+ * lower_bound
+ *		Determine first index so that (values[index] >= value).
+ *
+ * The array of values is sorted, and this returns the index of the first
+ * value that exceeds (or is equal to) the minvalue. If there is no such
+ * value (which includes an empty array), nvalues is returned.
+ */
+static int
+lower_bound(Datum *values, int nvalues, Datum minvalue, TypeCacheEntry *typcache)
+{
+	int			start = 0,
+				end = (nvalues - 1);
+
+	/* empty array - nothing matches (and don't touch values[-1]) */
+	if (nvalues <= 0)
+		return 0;
+
+	/* all values exceed minvalue - return the first element */
+	if (range_values_cmp(&minvalue, &values[start], typcache) <= 0)
+		return 0;
+
+	/* nothing matches - return the element after the last one */
+	if (range_values_cmp(&minvalue, &values[end], typcache) > 0)
+		return nvalues;
+
+	/*
+	 * Now we know the lower boundary is somewhere in the array (and we know
+	 * it's not the first element, because that's covered by the first check
+	 * above). So do a binary search.
+	 */
+	while ((end - start) > 0)
+	{
+		int			midpoint;
+		int			r;
+
+		midpoint = start + (end - start) / 2;
+
+		r = range_values_cmp(&minvalue, &values[midpoint], typcache);
+
+		if (r <= 0)				/* minvalue <= values[midpoint] */
+			end = midpoint;
+		else					/* values[midpoint] < minvalue */
+			start = (midpoint + 1);
+	}
+
+	Assert(range_values_cmp(&minvalue, &values[start], typcache) <= 0);
+	Assert(range_values_cmp(&minvalue, &values[start-1], typcache) > 0);
+
+	return start;
+}
+
+/*
+ * upper_bound
+ *		Determine last index so that (values[index] <= maxvalue).
+ *
+ * The array of values is sorted, and this returns the index of the last
+ * value that does not exceed (or is equal to) the maxvalue. If there is no
+ * such value (which includes an empty array), -1 is returned.
+ */
+static int
+upper_bound(Datum *values, int nvalues, Datum maxvalue, TypeCacheEntry *typcache)
+{
+	int			start = 0,
+				end = (nvalues - 1);
+
+	/* empty array - nothing matches (and don't touch values[-1]) */
+	if (nvalues <= 0)
+		return -1;
+
+	/* everything matches, return the last element */
+	if (range_values_cmp(&values[end], &maxvalue, typcache) <= 0)
+		return (nvalues - 1);
+
+	/* nothing matches, return the element before the first one */
+	if (range_values_cmp(&values[start], &maxvalue, typcache) > 0)
+		return -1;
+
+	/*
+	 * Now we know the upper boundary is somewhere in the array (and we know
+	 * it's not the last element, because that's covered by the first check
+	 * above). So do a binary search.
+	 */
+	while ((end - start) > 0)
+	{
+		int			midpoint;
+		int			r;
+
+		midpoint = start + (end - start) / 2;
+
+		/* Ensure we always move (it might be equal to start due to rounding). */
+		midpoint = Max(start+1, midpoint);
+
+		r = range_values_cmp(&values[midpoint], &maxvalue, typcache);
+
+		if (r <= 0)				/* values[midpoint] <= maxvalue */
+			start = midpoint;
+		else					/* values[midpoint] > maxvalue */
+			end = midpoint - 1;
+	}
+
+	Assert(range_values_cmp(&values[start], &maxvalue, typcache) <= 0);
+	Assert(range_values_cmp(&values[start+1], &maxvalue, typcache) > 0);
+
+	return start;
+}
+
+/*
+ * brin_minmax_count_overlaps
+ *		Calculate number of overlaps.
+ *
+ * This uses the minranges to quickly eliminate ranges that can't possibly
+ * intersect. We simply walk minranges until minval > current maxval, and
+ * we're done.
+ *
+ * Unlike brin_minmax_count_overlaps2, this does not have issues with wide
+ * ranges, so this is what we should use.
+ *
+ * minranges has to be sorted by min_value. The result (average number of
+ * other ranges overlapping a range) is stored in stats->avg_overlaps.
+ */
+static void
+brin_minmax_count_overlaps(BrinRange **minranges, int nranges,
+						   TypeCacheEntry *typcache, BrinMinmaxStats *stats)
+{
+	int64		noverlaps;
+
+	/* no ranges means no overlaps (and don't divide by zero) */
+	if (nranges <= 0)
+	{
+		stats->avg_overlaps = 0;
+		return;
+	}
+
+	noverlaps = 0;
+	for (int i = 0; i < nranges; i++)
+	{
+		Datum		maxval = minranges[i]->max_value;
+
+		/*
+		 * Determine index of the first range with (minval > current maxval)
+		 * by binary search. We know all other ranges can't overlap the
+		 * current one. We simply subtract indexes to count ranges.
+		 */
+		int			idx = minval_end(minranges, nranges, maxval, typcache);
+
+		/* -1 because we don't count the range as intersecting with itself */
+		noverlaps += (idx - i - 1);
+	}
+
+	/*
+	 * We only count 1/2 the ranges (minval > current minval), so the total
+	 * number of overlaps is twice what we counted.
+	 */
+	noverlaps *= 2;
+
+	stats->avg_overlaps = (double) noverlaps / nranges;
+}
+
+/*
+ * brin_minmax_match_tuples_to_ranges
+ *		Match tuples to ranges, count average number of ranges per tuple.
+ *
+ * Alternative to brin_minmax_match_tuples_to_ranges2, leveraging ordering
+ * of values, not ranges.
+ *
+ * "values" are the nvalues non-NULL sample values, sorted; numrows is the
+ * number of sample rows (including rows with NULL). The results are stored
+ * in stats->avg_matches and stats->avg_matches_unique, both are set to 0
+ * when there are no sample values.
+ *
+ * XXX This seems like the optimal way to do this.
+ */
+static void
+brin_minmax_match_tuples_to_ranges(BrinRanges *ranges,
+								   int numrows, HeapTuple *rows,
+								   int nvalues, Datum *values,
+								   TypeCacheEntry *typcache,
+								   BrinMinmaxStats *stats)
+{
+	int64		nmatches = 0;
+	int64		nmatches_unique = 0;
+	int64		nvalues_unique = 0;
+	int64	   *unique;
+
+	/*
+	 * Without sample values there's nothing to match. Bail out early, so
+	 * that we don't access unique[0] / unique[nvalues-1] of an empty array
+	 * and don't divide by zero.
+	 */
+	if (nvalues <= 0 || numrows <= 0)
+	{
+		stats->avg_matches = 0;
+		stats->avg_matches_unique = 0;
+		return;
+	}
+
+	unique = (int64 *) palloc0(sizeof(int64) * nvalues);
+
+	/*
+	 * Build running count of unique values. We know there are unique[i]
+	 * unique values in values array up to index "i".
+	 */
+	unique[0] = 1;
+	for (int i = 1; i < nvalues; i++)
+	{
+		if (range_values_cmp(&values[i-1], &values[i], typcache) == 0)
+			unique[i] = unique[i-1];
+		else
+			unique[i] = unique[i-1] + 1;
+	}
+
+	nvalues_unique = unique[nvalues-1];
+
+	/*
+	 * Walk the ranges, for each range determine the first/last mapping
+	 * value. Use the "unique" array to count the unique values.
+	 */
+	for (int i = 0; i < ranges->nranges; i++)
+	{
+		int			start,
+					end,
+					nvalues_match,
+					nunique_match;
+
+		CHECK_FOR_INTERRUPTS();
+
+		start = lower_bound(values, nvalues, ranges->ranges[i].min_value, typcache);
+		end = upper_bound(values, nvalues, ranges->ranges[i].max_value, typcache);
+
+		/*
+		 * If nothing matches (e.g. end=-1 or start=nvalues), skip this
+		 * range. Both bounds are inclusive, so (end == start) means exactly
+		 * one value matches and has to be counted.
+		 */
+		if (end < start)
+			continue;
+
+		nvalues_match = (end - start + 1);
+		nunique_match = (unique[end] - unique[start] + 1);
+
+		Assert((nvalues_match >= 1) && (nvalues_match <= nvalues));
+		Assert((nunique_match >= 1) && (nunique_match <= unique[nvalues-1]));
+
+		nmatches += nvalues_match;
+		nmatches_unique += nunique_match;
+	}
+
+	Assert(nmatches >= 0);
+	Assert(nmatches_unique >= 0);
+
+	stats->avg_matches = (double) nmatches / numrows;
+	stats->avg_matches_unique = (double) nmatches_unique / nvalues_unique;
+
+	/* possibly quite large, so release explicitly */
+	pfree(unique);
+}
+
+/*
+ * brin_minmax_value_stats
+ *		Calculate statistics about minval/maxval values.
+ *
+ * We calculate the number of distinct values, and also correlation with respect
+ * to blkno_start. We don't calculate the regular correlation coefficient, because
+ * our goal is to estimate how sequential the accesses are. The regular correlation
+ * would produce 0 for cyclical data sets like mod(i,1000000), but it may be quite
+ * sequential access. Maybe it should be called differently, not correlation?
+ *
+ * XXX Maybe this should calculate minval vs. maxval correlation too?
+ *
+ * XXX I don't know how important the sequentiality is - BRIN generally uses 1MB
+ * page ranges, which is pretty sequential and the one random seek in between is
+ * likely going to be negligible. Maybe for small page ranges it'll matter, though.
+ */
+static void
+brin_minmax_value_stats(BrinRange **minranges, BrinRange **maxranges,
+						int nranges, TypeCacheEntry *typcache,
+						BrinMinmaxStats *stats)
+{
+	/* distinct boundary values (the first range always counts) */
+	int64		ndistinct_min = 1;
+	int64		ndistinct_max = 1;
+
+	/* neighbors (in the sorted order) that are adjacent in the table too */
+	int64		nadjacent_min = 0;
+	int64		nadjacent_max = 0;
+
+	for (int i = 1; i < nranges; i++)
+	{
+		BrinRange  *prevmin = minranges[i - 1];
+		BrinRange  *currmin = minranges[i];
+		BrinRange  *prevmax = maxranges[i - 1];
+		BrinRange  *currmax = maxranges[i];
+
+		/* a new minval / maxval compared to the preceding range? */
+		if (range_values_cmp(&prevmin->min_value, &currmin->min_value, typcache) != 0)
+			ndistinct_min += 1;
+
+		if (range_values_cmp(&prevmax->max_value, &currmax->max_value, typcache) != 0)
+			ndistinct_max += 1;
+
+		/* does the range start right after the preceding one ends? */
+		if (currmin->blkno_start == prevmin->blkno_end + 1)
+			nadjacent_min += 1;
+
+		if (currmax->blkno_start == prevmax->blkno_end + 1)
+			nadjacent_max += 1;
+	}
+
+	stats->minval_ndistinct = ndistinct_min;
+	stats->maxval_ndistinct = ndistinct_max;
+
+	stats->minval_correlation = (double) nadjacent_min / nranges;
+	stats->maxval_correlation = (double) nadjacent_max / nranges;
+}
+
+/*
+ * brin_minmax_increment_stats
+ *		Calculate the increment size for minval/maxval steps.
+ *
+ * Calculates the minval/maxval increment size, i.e. number of rows that need
+ * to be added to the sort. This serves as an input to calculation of a good
+ * watermark step.
+ */
+static void
+brin_minmax_increment_stats(BrinRange **minranges, BrinRange **maxranges,
+							int nranges, Datum *values, int nvalues,
+							TypeCacheEntry *typcache, BrinMinmaxStats *stats)
+{
+	/* number of distinct minval/maxval values (i.e. number of steps) */
+	int64		minval_ndist = 1,
+				maxval_ndist = 1;
+
+	/* sum / maximum of the per-step fractions of the sample */
+	double		sum_minval = 0,
+				sum_maxval = 0,
+				max_minval = 0,
+				max_maxval = 0;
+
+	/*
+	 * With no sample values every increment is 0, and upper_bound() must
+	 * not be called on an empty array (and we'd divide by zero).
+	 */
+	if (nvalues <= 0)
+	{
+		stats->minval_increment_avg = 0;
+		stats->minval_increment_max = 0;
+		stats->maxval_increment_avg = 0;
+		stats->maxval_increment_max = 0;
+		return;
+	}
+
+	for (int i = 1; i < nranges; i++)
+	{
+		if (range_values_cmp(&minranges[i-1]->min_value, &minranges[i]->min_value, typcache) != 0)
+		{
+			int			start = upper_bound(values, nvalues, minranges[i-1]->min_value, typcache);
+			int			end = upper_bound(values, nvalues, minranges[i]->min_value, typcache);
+
+			/*
+			 * Maybe there are no matching rows, but we still need to count
+			 * this as distinct minval (even though the sample increase is 0).
+			 */
+			minval_ndist++;
+
+			Assert(end >= start);
+
+			/*
+			 * Only account for the step if some sample rows match it. We
+			 * must not "continue" here, the maxval step for this iteration
+			 * still has to be processed below.
+			 */
+			if (end > start)
+			{
+				double		p = (double) (end - start) / nvalues;
+
+				max_minval = Max(max_minval, p);
+				sum_minval += p;
+			}
+		}
+
+		if (range_values_cmp(&maxranges[i-1]->max_value, &maxranges[i]->max_value, typcache) != 0)
+		{
+			int			start = upper_bound(values, nvalues, maxranges[i-1]->max_value, typcache);
+			int			end = upper_bound(values, nvalues, maxranges[i]->max_value, typcache);
+
+			/*
+			 * Maybe there are no matching rows, but we still need to count
+			 * this as distinct maxval (even though the sample increase is 0).
+			 */
+			maxval_ndist++;
+
+			Assert(end >= start);
+
+			/* only account for the step if some sample rows match it */
+			if (end > start)
+			{
+				double		p = (double) (end - start) / nvalues;
+
+				max_maxval = Max(max_maxval, p);
+				sum_maxval += p;
+			}
+		}
+	}
+
+	stats->minval_increment_avg = (sum_minval / minval_ndist);
+	stats->minval_increment_max = max_minval;
+
+	stats->maxval_increment_avg = (sum_maxval / maxval_ndist);
+	stats->maxval_increment_max = max_maxval;
+}
+
+/*
+ * brin_minmax_stats
+ *		Calculate custom statistics for a BRIN minmax index.
+ *
+ * At the moment this calculates:
+ *
+ *  - number of summarized/not-summarized and all/has nulls ranges
+ *  - average number of overlaps for a range
+ *  - average number of rows matching a range
+ *  - number of distinct minval/maxval values
+ *
+ * XXX This could also calculate correlation of the range minval, so that
+ * we can estimate how much random I/O will happen during the BrinSort.
+ * And perhaps we should also sort the ranges by (minval,block_start) to
+ * make this as sequential as possible?
+ *
+ * XXX Another interesting statistics might be the number of ranges with
+ * the same minval (or number of distinct minval values), because that's
+ * essentially what we need to estimate how many ranges will be read in
+ * one brinsort step. In fact, knowing the number of distinct minval
+ * values tells us the number of BrinSort loops.
+ *
+ * XXX We might also calculate a histogram of minval/maxval values.
+ *
+ * XXX I wonder if we could track probabilities for each range:
+ *
+ *  - P1 = P(v <= minval)
+ *  - P2 = P(x <= Max(maxval)) for Max(maxval) over preceding ranges
+ *
+ * That would allow us to estimate how many ranges we'll have to read to produce
+ * a particular number of rows, because we need the first probability to exceed
+ * the requested number of rows (fraction of the table):
+ *
+ *     (limit rows / reltuples) <= P(v <= minval)
+ *
+ * and then the second probability would say how many rows we'll process (either
+ * sort or spill). And inversely for the DESC ordering.
+ *
+ * The difference between P1 for two ranges is how much we'd have to sort
+ * if we moved the watermark between the ranges (first minval to second one).
+ * The (P2 - P1) for the new watermark range measures the number of rows in
+ * the tuplestore. We'll need to aggregate this, though, we can't keep the
+ * whole data - probably average/median/max for the differences would be nice.
+ * Might be tricky for different watermark step values, though.
+ *
+ * This would also allow estimating how many rows will spill from each range,
+ * because we have an estimate how many rows match a range on average, and
+ * we can compare it to the difference between P1.
+ *
+ * One issue is we don't have actual tuples from the ranges, so we can't
+ * measure exactly how many rows would we add. But we can match the sample
+ * and at least estimate the probability difference.
+ *
+ * Actually - we do know the tuples *are* in those ranges, because if we
+ * assume the tuple is in some other range, that range would have to have
+ * a minimal/maximal value so that the value is consistent. Which means
+ * the range has to be between those ranges. Of course, this only estimates
+ * the rows we're going to add to the tuplesort - there might be more rows
+ * we read and spill to tuplestore, but that's something we can estimate
+ * using average tuples per range.
+ *
+ * Arguments (fmgr):
+ *
+ *  0 - heap relation
+ *  1 - index relation
+ *  2 - index attribute number
+ *  3 - heap attribute number (invalid when an expression is supplied)
+ *  4 - index expression (NULL when a heap attribute is supplied)
+ *  5 - array of sample rows (HeapTuple)
+ *  6 - number of sample rows
+ *
+ * Returns a pointer to a palloc'd BrinMinmaxStats, with the varlena size
+ * header set.
+ */
+Datum
+brin_minmax_stats(PG_FUNCTION_ARGS)
+{
+	Relation	heapRel = (Relation) PG_GETARG_POINTER(0);
+	Relation	indexRel = (Relation) PG_GETARG_POINTER(1);
+	AttrNumber	attnum = PG_GETARG_INT16(2);	/* index attnum */
+	AttrNumber	heap_attnum = PG_GETARG_INT16(3);
+	Expr	   *expr = (Expr *) PG_GETARG_POINTER(4);
+	HeapTuple  *rows = (HeapTuple *) PG_GETARG_POINTER(5);
+	int			numrows = PG_GETARG_INT32(6);
+
+	BrinOpaque *opaque;
+	BlockNumber nblocks;
+	BlockNumber nranges;
+	BlockNumber heapBlk;
+	BrinMemTuple *dtup;
+	BrinTuple  *btup = NULL;
+	Size		btupsz = 0;
+	Buffer		buf = InvalidBuffer;
+	BrinRanges *ranges;
+	BlockNumber pagesPerRange;
+	BrinDesc   *bdesc;
+	BrinMinmaxStats *stats;
+	Form_pg_attribute attr;
+
+	Oid			typoid;
+	TypeCacheEntry *typcache;
+	BrinRange **minranges,
+			  **maxranges;
+	int64		prev_min_index;
+
+	/* expression stats (only set up when an expression is supplied) */
+	EState	   *estate = NULL;
+	ExprContext *econtext = NULL;
+	ExprState  *exprstate = NULL;
+	TupleTableSlot *slot = NULL;
+
+	/* attnum or expression has to be supplied */
+	Assert(AttributeNumberIsValid(heap_attnum) || (expr != NULL));
+
+	/* but not both of them at the same time */
+	Assert(!(AttributeNumberIsValid(heap_attnum) && (expr != NULL)));
+
+	/*
+	 * Mostly what brinbeginscan does to initialize BrinOpaque, except that
+	 * we use active snapshot instead of the scan snapshot.
+	 */
+	opaque = palloc_object(BrinOpaque);
+	opaque->bo_rmAccess = brinRevmapInitialize(indexRel,
+											   &opaque->bo_pagesPerRange,
+											   GetActiveSnapshot());
+	opaque->bo_bdesc = brin_build_desc(indexRel);
+
+	bdesc = opaque->bo_bdesc;
+	pagesPerRange = opaque->bo_pagesPerRange;
+
+	/* make sure the provided attnum is valid */
+	Assert((attnum > 0) && (attnum <= bdesc->bd_tupdesc->natts));
+
+	/* attribute information */
+	attr = TupleDescAttr(bdesc->bd_tupdesc, attnum - 1);
+
+	/*
+	 * We need to know the size of the table so that we know how long to iterate
+	 * on the revmap (and to pre-allocate the arrays).
+	 */
+	nblocks = RelationGetNumberOfBlocks(heapRel);
+
+	/*
+	 * How many ranges can there be? We simply look at the number of pages,
+	 * divide it by the pages_per_range.
+	 *
+	 * XXX We need to be careful not to overflow nranges, so we just divide
+	 * and then maybe add 1 for partial ranges.
+	 */
+	nranges = (nblocks / pagesPerRange);
+	if (nblocks % pagesPerRange != 0)
+		nranges += 1;
+
+	/* allocate for space, and also for the alternative ordering */
+	ranges = palloc0(offsetof(BrinRanges, ranges) + nranges * sizeof(BrinRange));
+	ranges->nranges = 0;
+
+	/* allocate an initial in-memory tuple, out of the per-range memcxt */
+	dtup = brin_new_memtuple(bdesc);
+
+	/* result stats */
+	stats = palloc0(sizeof(BrinMinmaxStats));
+	SET_VARSIZE(stats, sizeof(BrinMinmaxStats));
+
+	/*
+	 * Now scan the revmap. We start by querying for heap page 0,
+	 * incrementing by the number of pages per range; this gives us a full
+	 * view of the table.
+	 *
+	 * XXX We count the ranges, and count the special types (not summarized,
+	 * all-null and has-null). The regular ranges are accumulated into an
+	 * array, so that we can calculate additional statistics (overlaps, hits
+	 * for sample tuples, etc).
+	 *
+	 * XXX This needs rethinking to make it work with large indexes with more
+	 * ranges than we can fit into memory (work_mem/maintenance_work_mem).
+	 */
+	for (heapBlk = 0; heapBlk < nblocks; heapBlk += pagesPerRange)
+	{
+		BrinTuple  *tup;
+		BrinValues *bval;
+		OffsetNumber off;
+		Size		size;
+
+		stats->n_ranges++;
+
+		CHECK_FOR_INTERRUPTS();
+
+		tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
+									   &off, &size, BUFFER_LOCK_SHARE,
+									   GetActiveSnapshot());
+
+		/* Ranges with no indexed tuple are ignored for overlap analysis. */
+		if (!tup)
+			continue;
+
+		/* copy the tuple, so that we can release the buffer lock */
+		btup = brin_copy_tuple(tup, size, btup, &btupsz);
+		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+		dtup = brin_deform_tuple(bdesc, btup, dtup);
+
+		/* Placeholders can be ignored too, as if not summarized. */
+		if (dtup->bt_placeholder)
+			continue;
+
+		bval = &dtup->bt_columns[attnum - 1];
+
+		/* OK this range is summarized */
+		stats->n_summarized++;
+
+		if (bval->bv_allnulls)
+			stats->n_all_nulls++;
+
+		if (bval->bv_hasnulls)
+			stats->n_has_nulls++;
+
+		/* remember min/max of ranges with at least one non-NULL value */
+		if (!bval->bv_allnulls)
+		{
+			BrinRange  *range;
+
+			range = &ranges->ranges[ranges->nranges++];
+
+			range->blkno_start = heapBlk;
+			range->blkno_end = heapBlk + (pagesPerRange - 1);
+
+			range->min_value = datumCopy(bval->bv_values[0],
+										 attr->attbyval, attr->attlen);
+			range->max_value = datumCopy(bval->bv_values[1],
+										 attr->attbyval, attr->attlen);
+		}
+	}
+
+	if (buf != InvalidBuffer)
+		ReleaseBuffer(buf);
+
+	/* if we have no regular ranges, we're done */
+	if (ranges->nranges == 0)
+		goto cleanup;
+
+	/*
+	 * Build auxiliary info to optimize the calculation.
+	 *
+	 * We have ranges in the blocknum order, but that is not very useful when
+	 * calculating which ranges intersect - we could cross-check every range
+	 * against every other range, but that's O(N^2) and thus may get extremely
+	 * expensive pretty quick.
+	 *
+	 * To make that cheaper, we'll build two orderings, allowing us to quickly
+	 * eliminate ranges that can't possibly overlap:
+	 *
+	 * - minranges = ranges ordered by min_value
+	 * - maxranges = ranges ordered by max_value
+	 *
+	 * To count intersections, we'll then walk maxranges (i.e. ranges ordered
+	 * by maxval), and for each following range we'll check if it overlaps.
+	 * If yes, we'll proceed to the next one, until we find a range that does
+	 * not overlap. But there might be a later page overlapping - but we can
+	 * use a min_index_lowest tracking the minimum min_index for "future"
+	 * ranges to quickly decide if there are such ranges. If there are none,
+	 * we can terminate (and proceed to the next maxranges element), else we
+	 * have to process additional ranges.
+	 *
+	 * Note: This only counts overlaps with ranges with max_value higher than
+	 * the current one - we want to count all, but the overlaps with preceding
+	 * ranges have already been counted when processing those preceding ranges.
+	 * That is, we'll end up with counting each overlap just for one of those
+	 * ranges, so we get only 1/2 the count.
+	 *
+	 * Note: We don't count the range as overlapping with itself. This needs
+	 * to be considered later, when applying the statistics.
+	 *
+	 *
+	 * XXX This will not work for very many ranges - we can have up to 2^32 of
+	 * them, so allocating a ~32B struct for each would need a lot of memory.
+	 * Not sure what to do about that, perhaps we could sample a couple ranges
+	 * and do some calculations based on that? That is, we could process all
+	 * ranges up to some number (say, statistics_target * 300, as for rows), and
+	 * then sample ranges for larger tables. Then sort the sampled ranges, and
+	 * walk through all ranges once, comparing them to the sample and counting
+	 * overlaps (having them sorted should allow making this quite efficient,
+	 * I think - following algorithm similar to the one implemented here).
+	 */
+
+	/* info about ordering for the data type */
+	typoid = get_atttype(RelationGetRelid(indexRel), attnum);
+	typcache = lookup_type_cache(typoid, TYPECACHE_CMP_PROC_FINFO);
+
+	/* shouldn't happen, I think - we use this to build the index */
+	Assert(OidIsValid(typcache->cmp_proc_finfo.fn_oid));
+
+	/* arrays of pointers to individual ranges (BrinRange, not BrinRanges) */
+	minranges = (BrinRange **) palloc0(ranges->nranges * sizeof(BrinRange *));
+	maxranges = (BrinRange **) palloc0(ranges->nranges * sizeof(BrinRange *));
+
+	/*
+	 * Build and sort the ranges min_value / max_value (just pointers
+	 * to the main array). Then go and assign the min_index to each
+	 * range, and finally walk the maxranges array backwards and track
+	 * the min_index_lowest as minimum of "future" indexes.
+	 */
+	for (int i = 0; i < ranges->nranges; i++)
+	{
+		minranges[i] = &ranges->ranges[i];
+		maxranges[i] = &ranges->ranges[i];
+	}
+
+	qsort_arg(minranges, ranges->nranges, sizeof(BrinRange *),
+			  range_minval_cmp, typcache);
+
+	qsort_arg(maxranges, ranges->nranges, sizeof(BrinRange *),
+			  range_maxval_cmp, typcache);
+
+	/*
+	 * Update the min_index for each range. If the values are equal, be sure to
+	 * pick the lowest index with that min_value.
+	 */
+	minranges[0]->min_index = 0;
+	for (int i = 1; i < ranges->nranges; i++)
+	{
+		if (range_values_cmp(&minranges[i]->min_value, &minranges[i-1]->min_value, typcache) == 0)
+			minranges[i]->min_index = minranges[i-1]->min_index;
+		else
+			minranges[i]->min_index = i;
+	}
+
+	/*
+	 * Walk the maxranges backward and assign the min_index_lowest as
+	 * a running minimum.
+	 */
+	prev_min_index = ranges->nranges;
+	for (int i = (ranges->nranges - 1); i >= 0; i--)
+	{
+		maxranges[i]->min_index_lowest = Min(maxranges[i]->min_index,
+											 prev_min_index);
+		prev_min_index = maxranges[i]->min_index_lowest;
+	}
+
+	/* calculate average number of overlapping ranges for any range */
+	brin_minmax_count_overlaps(minranges, ranges->nranges, typcache, stats);
+
+	/* calculate minval/maxval stats (distinct values and correlation) */
+	brin_minmax_value_stats(minranges, maxranges,
+							ranges->nranges, typcache, stats);
+
+	/*
+	 * If processing expression, prepare context to evaluate it.
+	 *
+	 * XXX cleanup / refactoring needed
+	 */
+	if (expr)
+	{
+		estate = CreateExecutorState();
+		econtext = GetPerTupleExprContext(estate);
+
+		/* Need a slot to hold the current heap tuple, too */
+		slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel),
+										&TTSOpsHeapTuple);
+
+		/* Arrange for econtext's scan tuple to be the tuple under test */
+		econtext->ecxt_scantuple = slot;
+
+		exprstate = ExecPrepareExpr(expr, estate);
+	}
+
+	/* match tuples to ranges */
+	{
+		int			nvalues = 0;
+		Datum	   *values = (Datum *) palloc0(numrows * sizeof(Datum));
+
+		TupleDesc	tdesc = RelationGetDescr(heapRel);
+
+		/* extract the non-NULL values of the attribute / expression */
+		for (int i = 0; i < numrows; i++)
+		{
+			bool		isnull;
+			Datum		value;
+
+			if (!expr)
+				value = heap_getattr(rows[i], heap_attnum, tdesc, &isnull);
+			else
+			{
+				/*
+				 * Reset the per-tuple context each time, to reclaim any cruft
+				 * left behind by evaluating the predicate or index expressions.
+				 */
+				ResetExprContext(econtext);
+
+				/* Set up for predicate or expression evaluation */
+				ExecStoreHeapTuple(rows[i], slot, false);
+
+				value = ExecEvalExpr(exprstate,
+									 GetPerTupleExprContext(estate),
+									 &isnull);
+			}
+
+			if (!isnull)
+				values[nvalues++] = value;
+		}
+
+		/*
+		 * The sample may contain no non-NULL values (empty sample, or a
+		 * column that's NULL in all sampled rows). The helpers below index
+		 * into the values array, so skip them in that case - the
+		 * sample-based stats stay zero, thanks to the palloc0 above.
+		 */
+		if (nvalues > 0)
+		{
+			qsort_arg(values, nvalues, sizeof(Datum), range_values_cmp, typcache);
+
+			/* optimized algorithm */
+			brin_minmax_match_tuples_to_ranges(ranges,
+											   numrows, rows, nvalues, values,
+											   typcache, stats);
+
+			brin_minmax_increment_stats(minranges, maxranges, ranges->nranges,
+										values, nvalues, typcache, stats);
+		}
+	}
+
+	/* XXX cleanup / refactoring needed */
+	if (expr)
+	{
+		ExecDropSingleTupleTableSlot(slot);
+		FreeExecutorState(estate);
+	}
+
+	/*
+	 * Possibly quite large, so release explicitly and don't rely
+	 * on the memory context to discard this.
+	 */
+	pfree(minranges);
+	pfree(maxranges);
+
+cleanup:
+	/* possibly quite large, so release explicitly */
+	pfree(ranges);
+
+	/* free the BrinOpaque, just like brinendscan() would */
+	brinRevmapTerminate(opaque->bo_rmAccess);
+	brin_free_desc(opaque->bo_bdesc);
+
+	PG_RETURN_POINTER(stats);
+}
+
 /*
  * Cache and return the procedure for the given strategy.
* diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 65750958bb2..984a7f85cda 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -16,6 +16,7 @@ #include +#include "access/brin_internal.h" #include "access/detoast.h" #include "access/genam.h" #include "access/multixact.h" @@ -30,6 +31,7 @@ #include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" +#include "catalog/pg_am.h" #include "catalog/pg_collation.h" #include "catalog/pg_inherits.h" #include "catalog/pg_namespace.h" @@ -81,6 +83,7 @@ typedef struct AnlIndexData /* Default statistics target (GUC parameter) */ int default_statistics_target = 100; +bool enable_indexam_stats = false; /* A few variables that don't seem worth passing around as parameters */ static MemoryContext anl_context = NULL; @@ -92,7 +95,7 @@ static void do_analyze_rel(Relation onerel, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, bool inh, bool in_outer_xact, int elevel); static void compute_index_stats(Relation onerel, double totalrows, - AnlIndexData *indexdata, int nindexes, + AnlIndexData *indexdata, Relation *indexRels, int nindexes, HeapTuple *rows, int numrows, MemoryContext col_context); static VacAttrStats *examine_attribute(Relation onerel, int attnum, @@ -453,15 +456,49 @@ do_analyze_rel(Relation onerel, VacuumParams *params, { AnlIndexData *thisdata = &indexdata[ind]; IndexInfo *indexInfo; + bool collectAmStats; + Oid regproc; thisdata->indexInfo = indexInfo = BuildIndexInfo(Irel[ind]); thisdata->tupleFract = 1.0; /* fix later if partial */ - if (indexInfo->ii_Expressions != NIL && va_cols == NIL) + + /* + * Should we collect AM-specific statistics for any of the columns? + * + * If AM-specific statistics are enabled (using a GUC), see if we + * have an optional support procedure to build the statistics. 
+ * + * If there's any such attribute, we just force building stats + * even for regular index keys (not just expressions) and indexes + * without predicates. It'd be good to only build the AM stats, but + * for now this is good enough. + * + * XXX The GUC is there mostly to make it easier to enable/disable + * this during development. + * + * FIXME Only build the AM statistics, not the other stats. And only + * do that for the keys with the optional procedure. not all of them. + */ + collectAmStats = false; + if (enable_indexam_stats && (Irel[ind]->rd_indam->amstatsprocnum != 0)) + { + for (int j = 0; j < indexInfo->ii_NumIndexAttrs; j++) + { + regproc = index_getprocid(Irel[ind], (j+1), Irel[ind]->rd_indam->amstatsprocnum); + if (OidIsValid(regproc)) + { + collectAmStats = true; + break; + } + } + } + + if ((indexInfo->ii_Expressions != NIL || collectAmStats) && va_cols == NIL) { ListCell *indexpr_item = list_head(indexInfo->ii_Expressions); thisdata->vacattrstats = (VacAttrStats **) - palloc(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats *)); + palloc0(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats *)); tcnt = 0; for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { @@ -482,6 +519,12 @@ do_analyze_rel(Relation onerel, VacuumParams *params, if (thisdata->vacattrstats[tcnt] != NULL) tcnt++; } + else + { + thisdata->vacattrstats[tcnt] = + examine_attribute(Irel[ind], i + 1, NULL); + tcnt++; + } } thisdata->attr_cnt = tcnt; } @@ -587,7 +630,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params, if (nindexes > 0) compute_index_stats(onerel, totalrows, - indexdata, nindexes, + indexdata, Irel, nindexes, rows, numrows, col_context); @@ -821,12 +864,93 @@ do_analyze_rel(Relation onerel, VacuumParams *params, anl_context = NULL; }
+/*
+ * compute_indexam_stats
+ *		Call the optional procedure to compute AM-specific statistics.
+ *
+ * We simply call the procedure, which is expected to produce a bytea value.
+ *
+ * This is generic - the procedure is called for every index attribute whose
+ * opclass defines the support procedure identified by the amstatsprocnum
+ * of the index AM. Index AMs with amstatsprocnum set to 0 are ignored, and
+ * so are attributes for which no statistics are being collected. Nothing
+ * happens unless enable_indexam_stats is set.
+ *
+ * The result is stored in the staindexam field of the VacAttrStats entry
+ * for the index attribute, allocated in the anl_context of that entry.
+ */
+static void
+compute_indexam_stats(Relation onerel,
+					  Relation indexRel, IndexInfo *indexInfo,
+					  double totalrows, AnlIndexData *indexdata,
+					  HeapTuple *rows, int numrows)
+{
+	int			expridx;
+
+	if (!enable_indexam_stats)
+		return;
+
+	/* ignore index AMs without the optional procedure */
+	if (indexRel->rd_indam->amstatsprocnum == 0)
+		return;
+
+	/*
+	 * Look at attributes, and calculate stats for those that have the
+	 * optional stats proc for the opfamily.
+	 */
+	expridx = 0;
+	for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
+	{
+		AttrNumber	attno = (i + 1);
+		AttrNumber	attnum = indexInfo->ii_IndexAttrNumbers[i];	/* heap attnum */
+		RegProcedure regproc;
+		FmgrInfo   *statsproc;
+		Datum		datum;
+		VacAttrStats *stats;
+		MemoryContext oldcxt;
+		Node	   *expr = NULL;
+
+		/* expressions have no heap attnum, advance in the expression list */
+		if (!AttributeNumberIsValid(attnum))
+		{
+			expr = (Node *) list_nth(RelationGetIndexExpressions(indexRel),
+									 expridx);
+			expridx++;
+		}
+
+		/* do this first, as it doesn't fail when proc not defined */
+		regproc = index_getprocid(indexRel, attno, indexRel->rd_indam->amstatsprocnum);
+
+		/* ignore opclasses without the optional procedure */
+		if (!RegProcedureIsValid(regproc))
+			continue;
+
+		/*
+		 * Find the VacAttrStats entry for this index attribute. We can't
+		 * simply use vacattrstats[i] - the array is compacted (attributes
+		 * for which examine_attribute returned NULL are skipped), and it's
+		 * not built at all when ANALYZE was given a list of columns.
+		 *
+		 * NOTE(review): assumes examine_attribute() stores the index attnum
+		 * in tupattnum - confirm.
+		 */
+		stats = NULL;
+		for (int j = 0; j < indexdata->attr_cnt; j++)
+		{
+			VacAttrStats *candidate = indexdata->vacattrstats[j];
+
+			if (candidate != NULL && candidate->tupattnum == attno)
+			{
+				stats = candidate;
+				break;
+			}
+		}
+
+		/* no stats collected for this attribute, so nowhere to store it */
+		if (stats == NULL)
+			continue;
+
+		statsproc = index_getprocinfo(indexRel, attno, indexRel->rd_indam->amstatsprocnum);
+		Assert(statsproc != NULL);
+
+		oldcxt = MemoryContextSwitchTo(stats->anl_context);
+
+		/* call the proc, let the AM calculate whatever it wants */
+		/* XXX maybe we should just pass the index attno and leave the
+		 * expression handling up to the procedure? */
+		datum = FunctionCall7Coll(statsproc,
+								  indexRel->rd_indcollation[i],
+								  PointerGetDatum(onerel),
+								  PointerGetDatum(indexRel),
+								  Int16GetDatum(attno),
+								  Int16GetDatum(attnum),
+								  PointerGetDatum(expr),
+								  PointerGetDatum(rows),
+								  Int32GetDatum(numrows));
+
+		stats->staindexam = datum;
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+}
+
 /* * Compute statistics about indexes of a relation */ static void compute_index_stats(Relation onerel, double totalrows, - AnlIndexData *indexdata, int nindexes, + AnlIndexData *indexdata, Relation *indexRels, int nindexes, HeapTuple *rows, int numrows, MemoryContext col_context) { @@ -846,6 +970,7 @@ compute_index_stats(Relation onerel, double totalrows, { AnlIndexData *thisdata = &indexdata[ind]; IndexInfo *indexInfo = thisdata->indexInfo; + Relation indexRel = indexRels[ind]; int attr_cnt = thisdata->attr_cnt; TupleTableSlot *slot; EState *estate; @@ -858,6 +983,13 @@ compute_index_stats(Relation onerel, double totalrows, rowno; double totalindexrows; + /* + * If this is a BRIN index, try calling a procedure to collect + * extra opfamily-specific statistics (if procedure defined).
+ */ + compute_indexam_stats(onerel, indexRel, indexInfo, totalrows, + thisdata, rows, numrows); + /* Ignore index if no columns to analyze and not partial */ if (attr_cnt == 0 && indexInfo->ii_Predicate == NIL) continue; @@ -1661,6 +1793,13 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac); values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth); values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct); + + /* optional AM-specific stats */ + if (DatumGetPointer(stats->staindexam) != NULL) + values[Anum_pg_statistic_staindexam - 1] = stats->staindexam; + else + nulls[Anum_pg_statistic_staindexam - 1] = true; + i = Anum_pg_statistic_stakind1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 572d9b44643..97fee77ea57 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -2370,6 +2370,8 @@ serialize_expr_stats(AnlExprData *exprdata, int nexprs) values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac); values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth); values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct); + nulls[Anum_pg_statistic_staindexam - 1] = true; + i = Anum_pg_statistic_stakind1 - 1; for (k = 0; k < STATISTIC_NUM_SLOTS; k++) { diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index fe37e65af03..cc2f3ef012a 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7834,6 +7834,7 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Relation indexRel; ListCell *l; VariableStatData vardata; + double averageOverlaps; Assert(rte->rtekind == RTE_RELATION); @@ -7881,6 +7882,7 @@ brincostestimate(PlannerInfo *root, IndexPath 
*path, double loop_count, * correlation statistics, we will keep it as 0. */ *indexCorrelation = 0; + averageOverlaps = 0.0; foreach(l, path->indexclauses) { @@ -7890,6 +7892,36 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, /* attempt to lookup stats in relation for this index column */ if (attnum != 0) { + /* + * If AM-specific statistics are enabled, try looking up the stats + * for the index key. We only have this for minmax opclasses, so + * we just cast it like that. But other BRIN opclasses might need + * other stats so either we need to abstract this somehow, or maybe + * just collect sufficiently generic stats for all BRIN indexes. + * + * XXX Make this non-minmax specific. + */ + if (enable_indexam_stats) + { + BrinMinmaxStats *amstats + = (BrinMinmaxStats *) get_attindexam(index->indexoid, attnum); + + if (amstats) + { + elog(DEBUG1, "found AM stats: attnum %d n_ranges %lld n_summarized %lld n_all_nulls %lld n_has_nulls %lld avg_overlaps %f", + attnum, (long long)amstats->n_ranges, (long long)amstats->n_summarized, + (long long)amstats->n_all_nulls, (long long)amstats->n_has_nulls, + amstats->avg_overlaps); + + /* + * The only thing we use at the moment is the average number + * of overlaps for a single range. Use the other stuff too. + */ + averageOverlaps = Max(averageOverlaps, + 1.0 + amstats->avg_overlaps); + } + } + /* Simple variable -- look to stats for the underlying table */ if (get_relation_stats_hook && (*get_relation_stats_hook) (root, rte, attnum, &vardata)) @@ -7970,6 +8002,14 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, baserel->relid, JOIN_INNER, NULL); + /* + * XXX Can we combine qualSelectivity with the average number of matching + * ranges per value? qualSelectivity estimates how many tuples we are + * going to match, and average number of matches says how many ranges + * will each of those match on average.
We don't know how many will + * be duplicates, but it gives us a worst-case estimate, at least. + */ + /* * Now calculate the minimum possible ranges we could match with if all of * the rows were in the perfect order in the table's heap. @@ -7986,6 +8026,25 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, else estimatedRanges = Min(minimalRanges / *indexCorrelation, indexRanges); + elog(DEBUG1, "before index AM stats: cestimatedRanges = %f", estimatedRanges); + + /* + * If we found some AM stats, look at average number of overlapping ranges, + * and apply that to the currently estimated ranges. + * + * XXX We pretty much combine this with correlation info (because it was + * already applied in the estimatedRanges formula above), which might be + * overly pessimistic. The overlaps stats seem somewhat redundant with + * the correlation, so maybe we should do just one? The AM stats seem + * like more reliable information, because the correlation is not very + * sensitive to outliers, for example. So maybe let's prefer that, and + * only use the correlation as fallback when AM stats are not available? + */ + if (averageOverlaps > 0.0) + estimatedRanges = Min(estimatedRanges * averageOverlaps, indexRanges); + + elog(DEBUG1, "after index AM stats: cestimatedRanges = %f", estimatedRanges); + /* we expect to visit this portion of the table */ selec = estimatedRanges / indexRanges; diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index c07382051d6..e41aabdeae0 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -3138,6 +3138,47 @@ get_attavgwidth(Oid relid, AttrNumber attnum) return 0; } + +/* + * get_attindexam + * + * Given the table and attribute number of a column, get the index AM + * statistics. Return NULL if no data available. + * + * Currently this is only consulted for individual tables, not for inheritance + * trees, so we don't need an "inh" parameter.
+ */ +bytea * +get_attindexam(Oid relid, AttrNumber attnum) +{ + HeapTuple tp; + + tp = SearchSysCache3(STATRELATTINH, + ObjectIdGetDatum(relid), + Int16GetDatum(attnum), + BoolGetDatum(false)); + if (HeapTupleIsValid(tp)) + { + Datum val; + bytea *retval = NULL; + bool isnull; + + val = SysCacheGetAttr(STATRELATTINH, tp, + Anum_pg_statistic_staindexam, + &isnull); + + if (!isnull) + retval = (bytea *) PG_DETOAST_DATUM(val); + + // staindexam = ((Form_pg_statistic) GETSTRUCT(tp))->staindexam; + ReleaseSysCache(tp); + + return retval; + } + + return NULL; +} + /* * get_attstatsslot * diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 1c0583fe267..cd3d97a3a63 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1003,6 +1003,16 @@ struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexam_stats", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index AM stats."), + NULL, + GUC_EXPLAIN + }, + &enable_indexam_stats, + false, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index d06074b86f6..47e80ad150c 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -375,6 +375,7 @@ #enable_hashagg = on #enable_hashjoin = on #enable_incremental_sort = on +#enable_indexam_stats = off #enable_indexscan = on #enable_indexonlyscan = on #enable_material = on diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 4f1f67b4d03..e3eab725ae5 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -216,6 +216,8 @@ typedef struct IndexAmRoutine uint16 amsupport; /* opclass options support function number or 0 */ uint16 amoptsprocnum; + /* opclass statistics support function 
number or 0 */ + uint16 amstatsprocnum; /* does AM support ORDER BY indexed column's value? */ bool amcanorder; /* does AM support ORDER BY result of an operator on indexed column? */ diff --git a/src/include/access/brin.h b/src/include/access/brin.h index ed66f1b3d51..1d21b816fcd 100644 --- a/src/include/access/brin.h +++ b/src/include/access/brin.h @@ -34,6 +34,69 @@ typedef struct BrinStatsData BlockNumber revmapNumPages; } BrinStatsData; +/* + * Info about ranges for BRIN Sort. + */ +typedef struct BrinRange +{ + BlockNumber blkno_start; + BlockNumber blkno_end; + + Datum min_value; + Datum max_value; + bool has_nulls; + bool all_nulls; + bool not_summarized; + + /* + * Index of the range when ordered by min_value (if there are multiple + * ranges with the same min_value, it's the lowest one). + */ + uint32 min_index; + + /* + * Minimum min_index from all ranges with higher max_value (i.e. when + * sorted by max_value). If there are multiple ranges with the same + * max_value, it depends on the ordering (i.e. the ranges may get + * different min_index_lowest, depending on the exact ordering). + */ + uint32 min_index_lowest; +} BrinRange; + +typedef struct BrinRanges +{ + int nranges; + BrinRange ranges[FLEXIBLE_ARRAY_MEMBER]; +} BrinRanges; + +typedef struct BrinMinmaxStats +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int64 n_ranges; + int64 n_summarized; + int64 n_all_nulls; + int64 n_has_nulls; + + /* average number of overlapping ranges */ + double avg_overlaps; + + /* average number of matching ranges (per value) */ + double avg_matches; + double avg_matches_unique; + + /* minval/maxval stats (ndistinct, correlation to blkno) */ + int64 minval_ndistinct; + int64 maxval_ndistinct; + double minval_correlation; + double maxval_correlation; + + /* minval/maxval increment stats */ + double minval_increment_avg; + double minval_increment_max; + double maxval_increment_avg; + double maxval_increment_max; + +} BrinMinmaxStats; #define BRIN_DEFAULT_PAGES_PER_RANGE 128 #define BrinGetPagesPerRange(relation) \ diff --git a/src/include/access/brin_internal.h b/src/include/access/brin_internal.h index 97ddc925b27..eac796e6f47 100644 --- a/src/include/access/brin_internal.h +++ b/src/include/access/brin_internal.h @@ -75,6 +75,7 @@ typedef struct BrinDesc #define BRIN_PROCNUM_OPTIONS 5 /* optional */ /* procedure numbers up to 10 are reserved for BRIN future expansion */ #define BRIN_FIRST_OPTIONAL_PROCNUM 11 +#define BRIN_PROCNUM_STATISTICS 11 /* optional */ #define BRIN_LAST_OPTIONAL_PROCNUM 15 #undef BRIN_DEBUG diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 5b950129de0..9bbd1f14f12 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -804,6 +804,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/bytea_minmax_ops', amproclefttype => 'bytea', amprocrighttype => 'bytea', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/bytea_minmax_ops', amproclefttype => 'bytea', + amprocrighttype => 'bytea', amprocnum => '11', amproc => 'brin_minmax_stats' }, # bloom bytea { amprocfamily => 'brin/bytea_bloom_ops', amproclefttype => 'bytea', @@ -835,6 +837,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/char_minmax_ops', amproclefttype => 'char', 
amprocrighttype => 'char', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/char_minmax_ops', amproclefttype => 'char', + amprocrighttype => 'char', amprocnum => '11', amproc => 'brin_minmax_stats' }, # bloom "char" { amprocfamily => 'brin/char_bloom_ops', amproclefttype => 'char', @@ -864,6 +868,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/name_minmax_ops', amproclefttype => 'name', amprocrighttype => 'name', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/name_minmax_ops', amproclefttype => 'name', + amprocrighttype => 'name', amprocnum => '11', amproc => 'brin_minmax_stats' }, # bloom name { amprocfamily => 'brin/name_bloom_ops', amproclefttype => 'name', @@ -893,6 +899,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int8', amprocrighttype => 'int8', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int8', + amprocrighttype => 'int8', amprocnum => '11', amproc => 'brin_minmax_stats' }, { amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '1', @@ -905,6 +913,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int2', amprocrighttype => 'int2', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int2', + amprocrighttype => 'int2', amprocnum => '11', amproc => 'brin_minmax_stats' }, { amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '1', @@ -917,6 +927,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int4', amprocrighttype => 'int4', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/integer_minmax_ops', amproclefttype => 'int4', + amprocrighttype => 
'int4', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi integer: int2, int4, int8 { amprocfamily => 'brin/integer_minmax_multi_ops', amproclefttype => 'int2', @@ -1034,6 +1046,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/text_minmax_ops', amproclefttype => 'text', amprocrighttype => 'text', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/text_minmax_ops', amproclefttype => 'text', + amprocrighttype => 'text', amprocnum => '11', amproc => 'brin_minmax_stats' }, # bloom text { amprocfamily => 'brin/text_bloom_ops', amproclefttype => 'text', @@ -1062,6 +1076,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/oid_minmax_ops', amproclefttype => 'oid', amprocrighttype => 'oid', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/oid_minmax_ops', amproclefttype => 'oid', + amprocrighttype => 'oid', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi oid { amprocfamily => 'brin/oid_minmax_multi_ops', amproclefttype => 'oid', @@ -1110,6 +1126,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/tid_minmax_ops', amproclefttype => 'tid', amprocrighttype => 'tid', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/tid_minmax_ops', amproclefttype => 'tid', + amprocrighttype => 'tid', amprocnum => '11', amproc => 'brin_minmax_stats' }, # bloom tid { amprocfamily => 'brin/tid_bloom_ops', amproclefttype => 'tid', @@ -1160,6 +1178,9 @@ { amprocfamily => 'brin/float_minmax_ops', amproclefttype => 'float4', amprocrighttype => 'float4', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/float_minmax_ops', amproclefttype => 'float4', + amprocrighttype => 'float4', amprocnum => '11', + amproc => 'brin_minmax_stats' }, { amprocfamily => 'brin/float_minmax_ops', amproclefttype => 'float8', amprocrighttype => 'float8', amprocnum => '1', @@ -1173,6 +1194,9 @@ { amprocfamily => 'brin/float_minmax_ops', 
amproclefttype => 'float8', amprocrighttype => 'float8', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/float_minmax_ops', amproclefttype => 'float8', + amprocrighttype => 'float8', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi float { amprocfamily => 'brin/float_minmax_multi_ops', amproclefttype => 'float4', @@ -1261,6 +1285,9 @@ { amprocfamily => 'brin/macaddr_minmax_ops', amproclefttype => 'macaddr', amprocrighttype => 'macaddr', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/macaddr_minmax_ops', amproclefttype => 'macaddr', + amprocrighttype => 'macaddr', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi macaddr { amprocfamily => 'brin/macaddr_minmax_multi_ops', amproclefttype => 'macaddr', @@ -1314,6 +1341,9 @@ { amprocfamily => 'brin/macaddr8_minmax_ops', amproclefttype => 'macaddr8', amprocrighttype => 'macaddr8', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/macaddr8_minmax_ops', amproclefttype => 'macaddr8', + amprocrighttype => 'macaddr8', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi macaddr8 { amprocfamily => 'brin/macaddr8_minmax_multi_ops', @@ -1366,6 +1396,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/network_minmax_ops', amproclefttype => 'inet', amprocrighttype => 'inet', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/network_minmax_ops', amproclefttype => 'inet', + amprocrighttype => 'inet', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi inet { amprocfamily => 'brin/network_minmax_multi_ops', amproclefttype => 'inet', @@ -1436,6 +1468,9 @@ { amprocfamily => 'brin/bpchar_minmax_ops', amproclefttype => 'bpchar', amprocrighttype => 'bpchar', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/bpchar_minmax_ops', amproclefttype => 'bpchar', + amprocrighttype => 'bpchar', amprocnum => '11', + amproc => 
'brin_minmax_stats' }, # bloom character { amprocfamily => 'brin/bpchar_bloom_ops', amproclefttype => 'bpchar', @@ -1467,6 +1502,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/time_minmax_ops', amproclefttype => 'time', amprocrighttype => 'time', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/time_minmax_ops', amproclefttype => 'time', + amprocrighttype => 'time', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi time without time zone { amprocfamily => 'brin/time_minmax_multi_ops', amproclefttype => 'time', @@ -1517,6 +1554,9 @@ { amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'timestamp', amprocrighttype => 'timestamp', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'timestamp', + amprocrighttype => 'timestamp', amprocnum => '11', + amproc => 'brin_minmax_stats' }, { amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'timestamptz', amprocrighttype => 'timestamptz', amprocnum => '1', @@ -1530,6 +1570,9 @@ { amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'timestamptz', amprocrighttype => 'timestamptz', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'timestamptz', + amprocrighttype => 'timestamptz', amprocnum => '11', + amproc => 'brin_minmax_stats' }, { amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '1', @@ -1542,6 +1585,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'date', amprocrighttype => 'date', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/datetime_minmax_ops', amproclefttype => 'date', + amprocrighttype => 'date', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi datetime (date, timestamp, timestamptz) { amprocfamily => 
'brin/datetime_minmax_multi_ops', @@ -1668,6 +1713,9 @@ { amprocfamily => 'brin/interval_minmax_ops', amproclefttype => 'interval', amprocrighttype => 'interval', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/interval_minmax_ops', amproclefttype => 'interval', + amprocrighttype => 'interval', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi interval { amprocfamily => 'brin/interval_minmax_multi_ops', @@ -1721,6 +1769,9 @@ { amprocfamily => 'brin/timetz_minmax_ops', amproclefttype => 'timetz', amprocrighttype => 'timetz', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/timetz_minmax_ops', amproclefttype => 'timetz', + amprocrighttype => 'timetz', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi time with time zone { amprocfamily => 'brin/timetz_minmax_multi_ops', amproclefttype => 'timetz', @@ -1771,6 +1822,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/bit_minmax_ops', amproclefttype => 'bit', amprocrighttype => 'bit', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/bit_minmax_ops', amproclefttype => 'bit', + amprocrighttype => 'bit', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax bit varying { amprocfamily => 'brin/varbit_minmax_ops', amproclefttype => 'varbit', @@ -1785,6 +1838,9 @@ { amprocfamily => 'brin/varbit_minmax_ops', amproclefttype => 'varbit', amprocrighttype => 'varbit', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/varbit_minmax_ops', amproclefttype => 'varbit', + amprocrighttype => 'varbit', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax numeric { amprocfamily => 'brin/numeric_minmax_ops', amproclefttype => 'numeric', @@ -1799,6 +1855,9 @@ { amprocfamily => 'brin/numeric_minmax_ops', amproclefttype => 'numeric', amprocrighttype => 'numeric', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/numeric_minmax_ops', 
amproclefttype => 'numeric', + amprocrighttype => 'numeric', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi numeric { amprocfamily => 'brin/numeric_minmax_multi_ops', amproclefttype => 'numeric', @@ -1851,6 +1910,8 @@ amproc => 'brin_minmax_consistent' }, { amprocfamily => 'brin/uuid_minmax_ops', amproclefttype => 'uuid', amprocrighttype => 'uuid', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/uuid_minmax_ops', amproclefttype => 'uuid', + amprocrighttype => 'uuid', amprocnum => '11', amproc => 'brin_minmax_stats' }, # minmax multi uuid { amprocfamily => 'brin/uuid_minmax_multi_ops', amproclefttype => 'uuid', @@ -1924,6 +1985,9 @@ { amprocfamily => 'brin/pg_lsn_minmax_ops', amproclefttype => 'pg_lsn', amprocrighttype => 'pg_lsn', amprocnum => '4', amproc => 'brin_minmax_union' }, +{ amprocfamily => 'brin/pg_lsn_minmax_ops', amproclefttype => 'pg_lsn', + amprocrighttype => 'pg_lsn', amprocnum => '11', + amproc => 'brin_minmax_stats' }, # minmax multi pg_lsn { amprocfamily => 'brin/pg_lsn_minmax_multi_ops', amproclefttype => 'pg_lsn', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index e2a7642a2ba..af31ff911eb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8496,6 +8496,10 @@ { oid => '3386', descr => 'BRIN minmax support', proname => 'brin_minmax_union', prorettype => 'bool', proargtypes => 'internal internal internal', prosrc => 'brin_minmax_union' }, +{ oid => '9800', descr => 'BRIN minmax support', + proname => 'brin_minmax_stats', prorettype => 'bool', + proargtypes => 'internal internal int2 int2 internal int4', + prosrc => 'brin_minmax_stats' }, # BRIN minmax multi { oid => '4616', descr => 'BRIN multi minmax support', diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h index 8770c5b4c60..d3d0bce257a 100644 --- a/src/include/catalog/pg_statistic.h +++ b/src/include/catalog/pg_statistic.h @@ -121,6 +121,11 
@@ CATALOG(pg_statistic,2619,StatisticRelationId) anyarray stavalues3; anyarray stavalues4; anyarray stavalues5; + + /* + * Statistics calculated by index AM (e.g. BRIN for ranges, etc.). + */ + bytea staindexam; #endif } FormData_pg_statistic; diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 689dbb77024..dba411cacf7 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -155,6 +155,7 @@ typedef struct VacAttrStats float4 *stanumbers[STATISTIC_NUM_SLOTS]; int numvalues[STATISTIC_NUM_SLOTS]; Datum *stavalues[STATISTIC_NUM_SLOTS]; + Datum staindexam; /* index-specific stats (as bytea) */ /* * These fields describe the stavalues[n] element types. They will be @@ -299,6 +300,7 @@ extern PGDLLIMPORT int vacuum_multixact_freeze_min_age; extern PGDLLIMPORT int vacuum_multixact_freeze_table_age; extern PGDLLIMPORT int vacuum_failsafe_age; extern PGDLLIMPORT int vacuum_multixact_failsafe_age; +extern PGDLLIMPORT bool enable_indexam_stats; /* Variables for cost-based parallel vacuum */ extern PGDLLIMPORT pg_atomic_uint32 *VacuumSharedCostBalance; diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 4f5418b9728..fcef91d306d 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -185,6 +185,7 @@ extern Oid getBaseType(Oid typid); extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod); extern int32 get_typavgwidth(Oid typid, int32 typmod); extern int32 get_attavgwidth(Oid relid, AttrNumber attnum); +extern bytea *get_attindexam(Oid relid, AttrNumber attnum); extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, int reqkind, Oid reqop, int flags); extern void free_attstatsslot(AttStatsSlot *sslot); diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 001c6e7eb9d..b7fda6fc828 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -117,6 +117,7 @@ select 
name, setting from pg_settings where name like 'enable%'; enable_hashagg | on enable_hashjoin | on enable_incremental_sort | on + enable_indexam_stats | off enable_indexonlyscan | on enable_indexscan | on enable_material | on @@ -132,7 +133,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(21 rows) +(22 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail -- 2.39.2