This patch implements the new tuple sampling method as discussed on
-hackers and -performance a few weeks ago.
"Large DB" -hackers 2004-04-02
"query slows down with more accurate stats" -perform 2004-04-13
"Tuple sampling" -hackers 2004-04-19
Servus
Manfred
diff -rcN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c
*** ../base/src/backend/commands/analyze.c Sun May 9 12:05:51 2004
--- src/backend/commands/analyze.c Sun May 9 19:33:27 2004
***************
*** 38,43 ****
--- 38,66 ----
#include "utils/syscache.h"
#include "utils/tuplesort.h"
+ /*
+ * With two-stage sampling we process at most 300000 blocks, each of
+ * which contains less than 1200 tuples (at 32K block size, smallest
+ * possible tuple size). So the number of tuples processed can be
+ * stored in a 32 bit int.
+ * Change this datatype to double if the number of rows in the table
+ * might exceed INT_MAX. Then the algorithm should work as long as the
+ * row count does not become so large that it is not represented accurately
+ * in a double (on IEEE-math machines this would be around 2^52 rows).
+ */
+ typedef int TupleCount;
+
+ /*
+ ** data structure for Algorithm S from Knuth 3.4.2
+ */
+ typedef struct
+ {
+ BlockNumber N; /* number of blocks, known in advance */
+ int n; /* sample size */
+ BlockNumber t; /* current block number */
+ int m; /* blocks selected so far */
+ } BlockSamplerData;
+ typedef BlockSamplerData *BlockSampler;
/* Per-index data for ANALYZE */
typedef struct AnlIndexData
***************
*** 57,62 ****
--- 80,89 ----
static MemoryContext anl_context = NULL;
+ static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize);
+ static bool BlockSampler_HasMore(BlockSampler bs);
+ static BlockNumber BlockSampler_Next(BlockSampler bs);
+
static void compute_index_stats(Relation onerel, double totalrows,
AnlIndexData *indexdata, int nindexes,
HeapTuple *rows, int numrows,
***************
*** 66,72 ****
int targrows, double *totalrows);
static double random_fract(void);
static double init_selection_state(int n);
! static double select_next_random_record(double t, int n, double *stateptr);
static int compare_rows(const void *a, const void *b);
static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
--- 93,99 ----
int targrows, double *totalrows);
static double random_fract(void);
static double init_selection_state(int n);
! static TupleCount get_next_S(TupleCount t, int n, double *stateptr);
static int compare_rows(const void *a, const void *b);
static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
***************
*** 638,649 ****
}
/*
* acquire_sample_rows -- acquire a random sample of rows from the table
*
! * Up to targrows rows are collected (if there are fewer than that many
! * rows in the table, all rows are collected). When the table is larger
! * than targrows, a truly random sample is collected: every row has an
! * equal chance of ending up in the final sample.
*
* We also estimate the total number of rows in the table, and return that
* into *totalrows.
--- 665,754 ----
}
/*
+ ** BlockSampler_Init -- prepare for random sampling of blocknumbers
+ **
+ ** BlockSampler is used for stage one of our new two-stage tuple
+ ** sampling mechanism as discussed on -hackers 2004-04-02 (subject
+ ** "Large DB"). It selects a random sample of samplesize blocks out of
+ ** the nblocks blocks in the table. If the table has less than
+ ** samplesize blocks, all blocks are selected.
+ **
+ */
+ static void
+ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+ {
+ bs->N = nblocks; /* table size */
+ /*
+ ** If we decide to reduce samplesize for tables that have less or
+ ** not much more than samplesize blocks, here is the place to do
+ ** it.
+ */
+ bs->n = samplesize;
+ bs->t = 0; /* blocks scanned so far */
+ bs->m = 0; /* blocks selected so far */
+ }
+
+ static bool
+ BlockSampler_HasMore(BlockSampler bs)
+ {
+ return (bs->t < bs->N) && (bs->m < bs->n);
+ }
+
+ static BlockNumber
+ BlockSampler_Next(BlockSampler bs)
+ {
+ BlockNumber K = bs->N - bs->t; /* remaining blocks */
+ int k = bs->n - bs->m; /* blocks still to sample */
+ double p; /* probability to skip the next block */
+ double V; /* random */
+
+ Assert(BlockSampler_HasMore(bs));
+
+ if (k >= K)
+ {
+ /* need all the rest */
+ bs->t += 1;
+ bs->m += 1;
+ return bs->t - 1;
+ }
+
+ p = 1.0 - (double) k / (double) K;
+ V = random_fract();
+ while (V < p)
+ {
+ bs->t += 1;
+ K -= 1;
+ /*
+ ** Assert(K > 0)
+ ** because we startet with K > k > 0,
+ ** and when K == k, the loop terminates
+ */
+ p *= 1.0 - (double) k / (double) K;
+ }
+
+ /* select */
+ bs->t += 1;
+ bs->m += 1;
+ return bs->t - 1;
+ }
+
+ /*
* acquire_sample_rows -- acquire a random sample of rows from the table
*
! * As of May 2004 we use a new two-stage method: Stage one selects up
! * to targrows random blocks (or all blocks, if there aren't so many).
! * Stage two scans these blocks and uses the Vitter algorithm to create
! * a random sample of targrows rows (or less, if there are less in the
! * sample of blocks). The two stages are executed simultaneously: Each
! * block is processed as soon as stage one returns its number and while
! * the rows are read stage two controls which ones are to be inserted
! * into the sample.
! *
! * Although every row has an equal chance of ending up in the final
! * sample, this sampling method is not perfect: not every possible
! * sample has an equal chance of being selected. For large relations
! * the number of different blocks represented by the sample tends to be
! * too small. We can live with that for now. Improvements are welcome.
*
* We also estimate the total number of rows in the table, and return that
* into *totalrows.
***************
*** 656,754 ****
double *totalrows)
{
int numrows = 0;
! HeapScanDesc scan;
BlockNumber totalblocks;
! HeapTuple tuple;
! ItemPointer lasttuple;
! BlockNumber lastblock,
! estblock;
! OffsetNumber lastoffset;
! int numest;
! double tuplesperpage;
! double t;
double rstate;
Assert(targrows > 1);
! /*
! * Do a simple linear scan until we reach the target number of rows.
! */
! scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
! totalblocks = scan->rs_nblocks; /* grab current relation size */
! while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
! {
! rows[numrows++] = heap_copytuple(tuple);
! if (numrows >= targrows)
! break;
! vacuum_delay_point();
! }
! heap_endscan(scan);
/*
! * If we ran out of tuples then we're done, no matter how few we
! * collected. No sort is needed, since they're already in order.
! */
! if (!HeapTupleIsValid(tuple))
! {
! *totalrows = (double) numrows;
!
! ereport(elevel,
! (errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
! RelationGetRelationName(onerel),
! totalblocks, numrows, *totalrows)));
!
! return numrows;
! }
!
/*
! * Otherwise, start replacing tuples in the sample until we reach the
! * end of the relation. This algorithm is from Jeff Vitter's paper
! * (see full citation below). It works by repeatedly computing the
! * number of the next tuple we want to fetch, which will replace a
! * randomly chosen element of the reservoir (current set of tuples).
! * At all times the reservoir is a true random sample of the tuples
! * we've passed over so far, so when we fall off the end of the
! * relation we're done.
! *
! * A slight difficulty is that since we don't want to fetch tuples or
! * even pages that we skip over, it's not possible to fetch *exactly*
! * the N'th tuple at each step --- we don't know how many valid tuples
! * are on the skipped pages. We handle this by assuming that the
! * average number of valid tuples/page on the pages already scanned
! * over holds good for the rest of the relation as well; this lets us
! * estimate which page the next tuple should be on and its position in
! * the page. Then we fetch the first valid tuple at or after that
! * position, being careful not to use the same tuple twice. This
! * approach should still give a good random sample, although it's not
! * perfect.
! */
! lasttuple = &(rows[numrows - 1]->t_self);
! lastblock = ItemPointerGetBlockNumber(lasttuple);
! lastoffset = ItemPointerGetOffsetNumber(lasttuple);
!
! /*
! * If possible, estimate tuples/page using only completely-scanned
! * pages.
! */
! for (numest = numrows; numest > 0; numest--)
! {
! if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock)
! break;
! }
! if (numest == 0)
! {
! numest = numrows; /* don't have a full page? */
! estblock = lastblock + 1;
! }
! else
! estblock = lastblock;
! tuplesperpage = (double) numest / (double) estblock;
!
! t = (double) numrows; /* t is the # of records processed so far */
rstate = init_selection_state(targrows);
! for (;;)
{
- double targpos;
BlockNumber targblock;
Buffer targbuffer;
Page targpage;
--- 761,788 ----
double *totalrows)
{
int numrows = 0;
! TupleCount liverows = 0;
! TupleCount deadrows = 0;
! TupleCount rowstoskip = -1;
BlockNumber totalblocks;
! BlockSamplerData bs;
double rstate;
Assert(targrows > 1);
! totalblocks = RelationGetNumberOfBlocks(onerel);
/*
! ** Prepare for sampling block numbers
! */
! BlockSampler_Init(&bs, totalblocks, targrows);
/*
! ** Prepare for sampling rows
! */
rstate = init_selection_state(targrows);
!
! while (BlockSampler_HasMore(&bs))
{
BlockNumber targblock;
Buffer targbuffer;
Page targpage;
***************
*** 757,783 ****
vacuum_delay_point();
! t = select_next_random_record(t, targrows, &rstate);
! /* Try to read the t'th record in the table */
! targpos = t / tuplesperpage;
! targblock = (BlockNumber) targpos;
! targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
! FirstOffsetNumber;
! /* Make sure we are past the last selected record */
! if (targblock <= lastblock)
! {
! targblock = lastblock;
! if (targoffset <= lastoffset)
! targoffset = lastoffset + 1;
! }
! /* Loop to find first valid record at or after given position */
! pageloop:;
!
! /*
! * Have we fallen off the end of the relation?
! */
! if (targblock >= totalblocks)
! break;
/*
* We must maintain a pin on the target page's buffer to ensure
--- 791,797 ----
vacuum_delay_point();
! targblock = BlockSampler_Next(&bs);
/*
* We must maintain a pin on the target page's buffer to ensure
***************
*** 795,848 ****
maxoffset = PageGetMaxOffsetNumber(targpage);
LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
! for (;;)
{
HeapTupleData targtuple;
Buffer tupbuffer;
- if (targoffset > maxoffset)
- {
- /* Fell off end of this page, try next */
- ReleaseBuffer(targbuffer);
- targblock++;
- targoffset = FirstOffsetNumber;
- goto pageloop;
- }
ItemPointerSet(&targtuple.t_self, targblock, targoffset);
if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
false, NULL))
{
/*
! * Found a suitable tuple, so save it, replacing one old
! * tuple at random
*/
! int k = (int) (targrows * random_fract());
- Assert(k >= 0 && k < targrows);
- heap_freetuple(rows[k]);
- rows[k] = heap_copytuple(&targtuple);
/* this releases the second pin acquired by heap_fetch: */
ReleaseBuffer(tupbuffer);
- /* this releases the initial pin: */
- ReleaseBuffer(targbuffer);
- lastblock = targblock;
- lastoffset = targoffset;
- break;
}
! /* this tuple is dead, so advance to next one on same page */
! targoffset++;
}
}
/*
! * Now we need to sort the collected tuples by position (itempointer).
*/
! qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
/*
* Estimate total number of valid rows in relation.
*/
! *totalrows = floor((double) totalblocks * tuplesperpage + 0.5);
/*
* Emit some interesting relation info
--- 809,895 ----
maxoffset = PageGetMaxOffsetNumber(targpage);
LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);
! for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; ++targoffset)
{
HeapTupleData targtuple;
Buffer tupbuffer;
ItemPointerSet(&targtuple.t_self, targblock, targoffset);
if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
false, NULL))
{
+ liverows += 1;
/*
! * The first targrows rows are simply copied into the
! * reservoir.
! * Then we start replacing tuples in the sample until
! * we reach the end of the relation. This algorithm is
! * from Jeff Vitter's paper (see full citation below).
! * It works by repeatedly computing the number of the
! * next tuple we want to fetch, which will replace a
! * randomly chosen element of the reservoir (current
! * set of tuples). At all times the reservoir is a true
! * random sample of the tuples we've passed over so far,
! * so when we fall off the end of the relation we're done.
*/
! if (numrows < targrows)
! rows[numrows++] = heap_copytuple(&targtuple);
! else
! {
! if (rowstoskip < 0)
! rowstoskip = get_next_S(liverows, targrows, &rstate);
! if (rowstoskip == 0)
! {
! /*
! * Found a suitable tuple, so save it,
! * replacing one old tuple at random
! */
! int k = (int) (targrows * random_fract());
!
! Assert(k >= 0 && k < targrows);
! heap_freetuple(rows[k]);
! rows[k] = heap_copytuple(&targtuple);
! }
!
! rowstoskip -= 1;
! }
/* this releases the second pin acquired by heap_fetch: */
ReleaseBuffer(tupbuffer);
}
! else
! {
! /* This information is currently not used. */
! deadrows += 1;
! }
}
+
+ /* this releases the initial pin: */
+ ReleaseBuffer(targbuffer);
}
+ ereport(DEBUG2,
+ (errmsg("%d pages sampled, %.0f live rows and %.0f dead rows scanned",
+ bs.m, (double) liverows, (double) deadrows)));
+
/*
! * If we ran out of tuples then we're done. No sort is needed,
! * since they're already in order.
! *
! * Otherwise we need to sort the collected tuples by position
! * (itempointer). I don't care for corner cases where the tuples
! * are already sorted.
*/
! if (numrows == targrows)
! qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);
/*
* Estimate total number of valid rows in relation.
*/
! if (bs.m > 0)
! *totalrows = floor((double) liverows * totalblocks / bs.m + 0.5);
! else
! *totalrows = 0.0;
/*
* Emit some interesting relation info
***************
*** 872,894 ****
/*
* These two routines embody Algorithm Z from "Random sampling with a
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
! * (Mar. 1985), Pages 37-57. While Vitter describes his algorithm in terms
! * of the count S of records to skip before processing another record,
! * it is convenient to work primarily with t, the index (counting from 1)
! * of the last record processed and next record to process. The only extra
! * state needed between calls is W, a random state variable.
! *
! * Note: the original algorithm defines t, S, numer, and denom as integers.
! * Here we express them as doubles to avoid overflow if the number of rows
! * in the table exceeds INT_MAX. The algorithm should work as long as the
! * row count does not become so large that it is not represented accurately
! * in a double (on IEEE-math machines this would be around 2^52 rows).
*
* init_selection_state computes the initial W value.
*
! * Given that we've already processed t records (t >= n),
! * select_next_random_record determines the number of the next record to
! * process.
*/
static double
init_selection_state(int n)
--- 919,935 ----
/*
* These two routines embody Algorithm Z from "Random sampling with a
* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
! * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
! * of the count S of records to skip before processing another record.
! * It is computed primarily based on t, the index (counting from 1)
! * of the last record processed. The only extra state needed between
! * calls is W, a random state variable.
*
* init_selection_state computes the initial W value.
*
! * Given that we've already processed t records (t >= n), get_next_S
! * determines the number of records to skip before the next record is
! * processed.
*/
static double
init_selection_state(int n)
***************
*** 897,932 ****
return exp(-log(random_fract()) / n);
}
! static double
! select_next_random_record(double t, int n, double *stateptr)
{
/* The magic constant here is T from Vitter's paper */
! if (t <= (22.0 * n))
{
/* Process records using Algorithm X until t is large enough */
double V,
quot;
V = random_fract(); /* Generate V */
t += 1;
! quot = (t - (double) n) / t;
/* Find min S satisfying (4.1) */
while (quot > V)
{
t += 1;
! quot *= (t - (double) n) / t;
}
}
else
{
/* Now apply Algorithm Z */
double W = *stateptr;
! double term = t - (double) n + 1;
! double S;
for (;;)
{
! double numer,
numer_lim,
denom;
double U,
--- 938,976 ----
return exp(-log(random_fract()) / n);
}
! static TupleCount
! get_next_S(TupleCount t, int n, double *stateptr)
{
/* The magic constant here is T from Vitter's paper */
! if (t <= (22 * n))
{
/* Process records using Algorithm X until t is large enough */
+ TupleCount S = 0;
double V,
quot;
V = random_fract(); /* Generate V */
t += 1;
! quot = 1.0 - (double) n / t;
/* Find min S satisfying (4.1) */
while (quot > V)
{
+ S += 1;
t += 1;
! quot *= 1.0 - (double) n / t;
}
+ return S;
}
else
{
/* Now apply Algorithm Z */
double W = *stateptr;
! TupleCount term = t - n + 1;
! TupleCount S;
for (;;)
{
! TupleCount numer,
numer_lim,
denom;
double U,
***************
*** 941,947 ****
X = t * (W - 1.0);
S = floor(X); /* S is tentatively set to floor(X) */
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
! tmp = (t + 1) / term;
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
rhs = (((t + X) / (term + S)) * term) / t;
if (lhs <= rhs)
--- 985,991 ----
X = t * (W - 1.0);
S = floor(X); /* S is tentatively set to floor(X) */
/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
! tmp = (double) (t + 1) / term;
lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
rhs = (((t + X) / (term + S)) * term) / t;
if (lhs <= rhs)
***************
*** 951,979 ****
}
/* Test if U <= f(S)/cg(X) */
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
! if ((double) n < S)
{
denom = t;
numer_lim = term + S;
}
else
{
! denom = t - (double) n + S;
numer_lim = t + 1;
}
for (numer = t + S; numer >= numer_lim; numer -= 1)
{
! y *= numer / denom;
denom -= 1;
}
W = exp(-log(random_fract()) / n); /* Generate W in advance */
if (exp(log(y) / n) <= (t + X) / t)
break;
}
- t += S + 1;
*stateptr = W;
}
- return t;
}
/*
--- 995,1022 ----
}
/* Test if U <= f(S)/cg(X) */
y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
! if (n < S)
{
denom = t;
numer_lim = term + S;
}
else
{
! denom = t - n + S;
numer_lim = t + 1;
}
for (numer = t + S; numer >= numer_lim; numer -= 1)
{
! y *= (double) numer / denom;
denom -= 1;
}
W = exp(-log(random_fract()) / n); /* Generate W in advance */
if (exp(log(y) / n) <= (t + X) / t)
break;
}
*stateptr = W;
+ return S;
}
}
/*