Tuple sampling - Mailing list pgsql-patches

From: Manfred Koizar
Subject: Tuple sampling
Date:
Msg-id: 1tkva0h547jhomsasujt2qs7gcgg0gtvrp@email.aon.at
Responses: Re: Tuple sampling (Tom Lane <tgl@sss.pgh.pa.us>)
List: pgsql-patches
This patch implements the new tuple sampling method as discussed on
-hackers and -performance a few weeks ago.

"Large DB" -hackers 2004-04-02
"query slows down with more accurate stats" -perform 2004-04-13
"Tuple sampling" -hackers 2004-04-19

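For anyone who wants to play with the stage-one idea outside the backend,
here is a minimal standalone sketch of Knuth's Algorithm S (3.4.2) applied
to block numbers.  This is illustrative code only, not part of the patch:
it uses plain rand() instead of the backend's random_fract(), and the
textbook per-record acceptance test instead of the skip loop in
BlockSampler_Next below; both select every block with equal probability.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Select n block numbers out of N without replacement (Knuth 3.4.2,
 * Algorithm S).  Block t is accepted with probability
 * (blocks still needed) / (blocks still available), which makes every
 * block equally likely to end up in the sample.
 */
static void
sample_blocks(unsigned N, unsigned n)
{
    unsigned    t;            /* blocks scanned so far */
    unsigned    m = 0;        /* blocks selected so far */

    for (t = 0; t < N && m < n; t++)
    {
        double      U = (double) rand() / ((double) RAND_MAX + 1.0);

        if ((double) (N - t) * U < (double) (n - m))
        {
            printf("selected block %u\n", t);
            m++;
        }
    }
}

int
main(void)
{
    srand((unsigned) time(NULL));
    sample_blocks(1000, 10);    /* sample 10 out of 1000 blocks */
    return 0;
}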
Servus
 Manfred
diff -rcN ../base/src/backend/commands/analyze.c src/backend/commands/analyze.c
*** ../base/src/backend/commands/analyze.c    Sun May  9 12:05:51 2004
--- src/backend/commands/analyze.c    Sun May  9 19:33:27 2004
***************
*** 38,43 ****
--- 38,66 ----
  #include "utils/syscache.h"
  #include "utils/tuplesort.h"

+ /*
+  * With two-stage sampling we process at most 300000 blocks, each of
+  * which contains fewer than 1200 tuples (assuming a 32K block size
+  * and the smallest possible tuple size).  So the number of tuples
+  * processed fits in a 32-bit int.
+  * Change this datatype to double if the number of rows in the table
+  * might exceed INT_MAX.  Then the algorithm should work as long as the
+  * row count does not become so large that it is not represented accurately
+  * in a double (on IEEE-math machines this would be around 2^52 rows).
+  */
+ typedef int TupleCount;
+
+ /*
+ ** data structure for Algorithm S from Knuth 3.4.2
+ */
+ typedef struct
+ {
+     BlockNumber    N;                /* number of blocks, known in advance */
+     int            n;                /* sample size */
+     BlockNumber    t;                /* current block number */
+     int            m;                /* blocks selected so far */
+ } BlockSamplerData;
+ typedef BlockSamplerData *BlockSampler;

  /* Per-index data for ANALYZE */
  typedef struct AnlIndexData
***************
*** 57,62 ****
--- 80,89 ----
  static MemoryContext anl_context = NULL;


+ static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize);
+ static bool BlockSampler_HasMore(BlockSampler bs);
+ static BlockNumber BlockSampler_Next(BlockSampler bs);
+
  static void compute_index_stats(Relation onerel, double totalrows,
                                  AnlIndexData *indexdata, int nindexes,
                                  HeapTuple *rows, int numrows,
***************
*** 66,72 ****
                      int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static double select_next_random_record(double t, int n, double *stateptr);
  static int    compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
--- 93,99 ----
                      int targrows, double *totalrows);
  static double random_fract(void);
  static double init_selection_state(int n);
! static TupleCount get_next_S(TupleCount t, int n, double *stateptr);
  static int    compare_rows(const void *a, const void *b);
  static void update_attstats(Oid relid, int natts, VacAttrStats **vacattrstats);
  static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
***************
*** 638,649 ****
  }

  /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * Up to targrows rows are collected (if there are fewer than that many
!  * rows in the table, all rows are collected).    When the table is larger
!  * than targrows, a truly random sample is collected: every row has an
!  * equal chance of ending up in the final sample.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
--- 665,754 ----
  }

  /*
+ ** BlockSampler_Init -- prepare for random sampling of blocknumbers
+ **
+ ** BlockSampler is used for stage one of our new two-stage tuple
+ ** sampling mechanism as discussed on -hackers 2004-04-02 (subject
+ ** "Large DB").  It selects a random sample of samplesize blocks out of
+ ** the nblocks blocks in the table.  If the table has fewer than
+ ** samplesize blocks, all blocks are selected.
+ **
+ */
+ static void
+ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize)
+ {
+     bs->N = nblocks;            /* table size */
+     /*
+     ** If we decide to reduce samplesize for tables that have fewer
+     ** than or not much more than samplesize blocks, here is the
+     ** place to do it.
+     */
+     bs->n = samplesize;
+     bs->t = 0;                    /* blocks scanned so far */
+     bs->m = 0;                    /* blocks selected so far */
+ }
+
+ static bool
+ BlockSampler_HasMore(BlockSampler bs)
+ {
+     return (bs->t < bs->N) && (bs->m < bs->n);
+ }
+
+ static BlockNumber
+ BlockSampler_Next(BlockSampler bs)
+ {
+     BlockNumber    K = bs->N - bs->t;        /* remaining blocks */
+     int            k = bs->n - bs->m;        /* blocks still to sample */
+     double        p;                /* probability to skip the next block */
+     double        V;                /* random */
+
+     Assert(BlockSampler_HasMore(bs));
+
+     if (k >= K)
+     {
+         /* need all the rest */
+         bs->t += 1;
+         bs->m += 1;
+         return bs->t - 1;
+     }
+
+     p = 1.0 - (double) k / (double) K;
+     V = random_fract();
+     while (V < p)
+     {
+         bs->t += 1;
+         K -= 1;
+         /*
+         ** Assert(K > 0)
+         ** because we started with K > k > 0,
+         ** and when K == k, the loop terminates
+         */
+         p *= 1.0 - (double) k / (double) K;
+     }
+
+     /* select */
+     bs->t += 1;
+     bs->m += 1;
+     return bs->t - 1;
+ }
+
+ /*
   * acquire_sample_rows -- acquire a random sample of rows from the table
   *
!  * As of May 2004 we use a new two-stage method:  Stage one selects up
!  * to targrows random blocks (or all blocks, if there aren't that many).
!  * Stage two scans these blocks and uses the Vitter algorithm to create
!  * a random sample of targrows rows (or fewer, if there are fewer in the
!  * sample of blocks).  The two stages are executed simultaneously:  each
!  * block is processed as soon as stage one returns its number, and while
!  * its rows are read, stage two controls which ones are to be inserted
!  * into the sample.
!  *
!  * Although every row has an equal chance of ending up in the final
!  * sample, this sampling method is not perfect: not every possible
!  * sample has an equal chance of being selected.  For large relations
!  * the number of different blocks represented by the sample tends to be
!  * too small.  We can live with that for now.  Improvements are welcome.
   *
   * We also estimate the total number of rows in the table, and return that
   * into *totalrows.
***************
*** 656,754 ****
                      double *totalrows)
  {
      int            numrows = 0;
!     HeapScanDesc scan;
      BlockNumber    totalblocks;
!     HeapTuple    tuple;
!     ItemPointer lasttuple;
!     BlockNumber lastblock,
!                 estblock;
!     OffsetNumber lastoffset;
!     int            numest;
!     double        tuplesperpage;
!     double        t;
      double        rstate;

      Assert(targrows > 1);

!     /*
!      * Do a simple linear scan until we reach the target number of rows.
!      */
!     scan = heap_beginscan(onerel, SnapshotNow, 0, NULL);
!     totalblocks = scan->rs_nblocks;    /* grab current relation size */
!     while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
!     {
!         rows[numrows++] = heap_copytuple(tuple);
!         if (numrows >= targrows)
!             break;
!         vacuum_delay_point();
!     }
!     heap_endscan(scan);

      /*
!      * If we ran out of tuples then we're done, no matter how few we
!      * collected.  No sort is needed, since they're already in order.
!      */
!     if (!HeapTupleIsValid(tuple))
!     {
!         *totalrows = (double) numrows;
!
!         ereport(elevel,
!                 (errmsg("\"%s\": %u pages, %d rows sampled, %.0f estimated total rows",
!                         RelationGetRelationName(onerel),
!                         totalblocks, numrows, *totalrows)));
!
!         return numrows;
!     }
!
      /*
!      * Otherwise, start replacing tuples in the sample until we reach the
!      * end of the relation.  This algorithm is from Jeff Vitter's paper
!      * (see full citation below).  It works by repeatedly computing the
!      * number of the next tuple we want to fetch, which will replace a
!      * randomly chosen element of the reservoir (current set of tuples).
!      * At all times the reservoir is a true random sample of the tuples
!      * we've passed over so far, so when we fall off the end of the
!      * relation we're done.
!      *
!      * A slight difficulty is that since we don't want to fetch tuples or
!      * even pages that we skip over, it's not possible to fetch *exactly*
!      * the N'th tuple at each step --- we don't know how many valid tuples
!      * are on the skipped pages.  We handle this by assuming that the
!      * average number of valid tuples/page on the pages already scanned
!      * over holds good for the rest of the relation as well; this lets us
!      * estimate which page the next tuple should be on and its position in
!      * the page.  Then we fetch the first valid tuple at or after that
!      * position, being careful not to use the same tuple twice.  This
!      * approach should still give a good random sample, although it's not
!      * perfect.
!      */
!     lasttuple = &(rows[numrows - 1]->t_self);
!     lastblock = ItemPointerGetBlockNumber(lasttuple);
!     lastoffset = ItemPointerGetOffsetNumber(lasttuple);
!
!     /*
!      * If possible, estimate tuples/page using only completely-scanned
!      * pages.
!      */
!     for (numest = numrows; numest > 0; numest--)
!     {
!         if (ItemPointerGetBlockNumber(&(rows[numest - 1]->t_self)) != lastblock)
!             break;
!     }
!     if (numest == 0)
!     {
!         numest = numrows;        /* don't have a full page? */
!         estblock = lastblock + 1;
!     }
!     else
!         estblock = lastblock;
!     tuplesperpage = (double) numest / (double) estblock;
!
!     t = (double) numrows;        /* t is the # of records processed so far */
      rstate = init_selection_state(targrows);
!     for (;;)
      {
-         double        targpos;
          BlockNumber targblock;
          Buffer        targbuffer;
          Page        targpage;
--- 761,788 ----
                      double *totalrows)
  {
      int            numrows = 0;
!     TupleCount    liverows = 0;
!     TupleCount    deadrows = 0;
!     TupleCount    rowstoskip = -1;
      BlockNumber    totalblocks;
!     BlockSamplerData bs;
      double        rstate;

      Assert(targrows > 1);

!     totalblocks = RelationGetNumberOfBlocks(onerel);

      /*
!     ** Prepare for sampling block numbers
!     */
!     BlockSampler_Init(&bs, totalblocks, targrows);
      /*
!     ** Prepare for sampling rows
!     */
      rstate = init_selection_state(targrows);
!
!     while (BlockSampler_HasMore(&bs))
      {
          BlockNumber targblock;
          Buffer        targbuffer;
          Page        targpage;
***************
*** 757,783 ****

          vacuum_delay_point();

!         t = select_next_random_record(t, targrows, &rstate);
!         /* Try to read the t'th record in the table */
!         targpos = t / tuplesperpage;
!         targblock = (BlockNumber) targpos;
!         targoffset = ((int) ((targpos - targblock) * tuplesperpage)) +
!             FirstOffsetNumber;
!         /* Make sure we are past the last selected record */
!         if (targblock <= lastblock)
!         {
!             targblock = lastblock;
!             if (targoffset <= lastoffset)
!                 targoffset = lastoffset + 1;
!         }
!         /* Loop to find first valid record at or after given position */
! pageloop:;
!
!         /*
!          * Have we fallen off the end of the relation?
!          */
!         if (targblock >= totalblocks)
!             break;

          /*
           * We must maintain a pin on the target page's buffer to ensure
--- 791,797 ----

          vacuum_delay_point();

!         targblock = BlockSampler_Next(&bs);

          /*
           * We must maintain a pin on the target page's buffer to ensure
***************
*** 795,848 ****
          maxoffset = PageGetMaxOffsetNumber(targpage);
          LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);

!         for (;;)
          {
              HeapTupleData targtuple;
              Buffer        tupbuffer;

-             if (targoffset > maxoffset)
-             {
-                 /* Fell off end of this page, try next */
-                 ReleaseBuffer(targbuffer);
-                 targblock++;
-                 targoffset = FirstOffsetNumber;
-                 goto pageloop;
-             }
              ItemPointerSet(&targtuple.t_self, targblock, targoffset);
              if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
                             false, NULL))
              {
                  /*
!                  * Found a suitable tuple, so save it, replacing one old
!                  * tuple at random
                   */
!                 int            k = (int) (targrows * random_fract());

-                 Assert(k >= 0 && k < targrows);
-                 heap_freetuple(rows[k]);
-                 rows[k] = heap_copytuple(&targtuple);
                  /* this releases the second pin acquired by heap_fetch: */
                  ReleaseBuffer(tupbuffer);
-                 /* this releases the initial pin: */
-                 ReleaseBuffer(targbuffer);
-                 lastblock = targblock;
-                 lastoffset = targoffset;
-                 break;
              }
!             /* this tuple is dead, so advance to next one on same page */
!             targoffset++;
          }
      }

      /*
!      * Now we need to sort the collected tuples by position (itempointer).
       */
!     qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);

      /*
       * Estimate total number of valid rows in relation.
       */
!     *totalrows = floor((double) totalblocks * tuplesperpage + 0.5);

      /*
       * Emit some interesting relation info
--- 809,895 ----
          maxoffset = PageGetMaxOffsetNumber(targpage);
          LockBuffer(targbuffer, BUFFER_LOCK_UNLOCK);

!         for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; ++targoffset)
          {
              HeapTupleData targtuple;
              Buffer        tupbuffer;

              ItemPointerSet(&targtuple.t_self, targblock, targoffset);
              if (heap_fetch(onerel, SnapshotNow, &targtuple, &tupbuffer,
                             false, NULL))
              {
+                 liverows += 1;
                  /*
!                  * The first targrows rows are simply copied into the
!                  * reservoir.
!                  * Then we start replacing tuples in the sample until
!                  * we reach the end of the relation.  This algorithm is
!                  * from Jeff Vitter's paper (see full citation below).
!                  * It works by repeatedly computing the number of the
!                  * next tuple we want to fetch, which will replace a
!                  * randomly chosen element of the reservoir (current
!                  * set of tuples).  At all times the reservoir is a true
!                  * random sample of the tuples we've passed over so far,
!                  * so when we fall off the end of the relation we're done.
                   */
!                 if (numrows < targrows)
!                     rows[numrows++] = heap_copytuple(&targtuple);
!                 else
!                 {
!                     if (rowstoskip < 0)
!                         rowstoskip = get_next_S(liverows, targrows, &rstate);
!                     if (rowstoskip == 0)
!                     {
!                         /*
!                          * Found a suitable tuple, so save it,
!                          * replacing one old tuple at random
!                          */
!                         int    k = (int) (targrows * random_fract());
!
!                         Assert(k >= 0 && k < targrows);
!                         heap_freetuple(rows[k]);
!                         rows[k] = heap_copytuple(&targtuple);
!                     }
!
!                     rowstoskip -= 1;
!                 }

                  /* this releases the second pin acquired by heap_fetch: */
                  ReleaseBuffer(tupbuffer);
              }
!             else
!             {
!                 /* This information is currently not used. */
!                 deadrows += 1;
!             }
          }
+
+         /* this releases the initial pin: */
+         ReleaseBuffer(targbuffer);
      }

+     ereport(DEBUG2,
+             (errmsg("%d pages sampled, %.0f live rows and %.0f dead rows scanned",
+                     bs.m, (double) liverows, (double) deadrows)));
+
      /*
!      * If we ran out of tuples then we're done.  No sort is needed,
!      * since they're already in order.
!      *
!      * Otherwise we need to sort the collected tuples by position
!      * (itempointer).  We don't bother to detect corner cases where
!      * the tuples are already sorted.
       */
!     if (numrows == targrows)
!         qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows);

      /*
       * Estimate total number of valid rows in relation.
       */
!     if (bs.m > 0)
!         *totalrows = floor((double) liverows * totalblocks / bs.m + 0.5);
!     else
!         *totalrows = 0.0;

      /*
       * Emit some interesting relation info
***************
*** 872,894 ****
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  While Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record,
!  * it is convenient to work primarily with t, the index (counting from 1)
!  * of the last record processed and next record to process.  The only extra
!  * state needed between calls is W, a random state variable.
!  *
!  * Note: the original algorithm defines t, S, numer, and denom as integers.
!  * Here we express them as doubles to avoid overflow if the number of rows
!  * in the table exceeds INT_MAX.  The algorithm should work as long as the
!  * row count does not become so large that it is not represented accurately
!  * in a double (on IEEE-math machines this would be around 2^52 rows).
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n),
!  * select_next_random_record determines the number of the next record to
!  * process.
   */
  static double
  init_selection_state(int n)
--- 919,935 ----
  /*
   * These two routines embody Algorithm Z from "Random sampling with a
   * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
!  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
!  * of the count S of records to skip before processing another record.
!  * It is computed primarily based on t, the index (counting from 1)
!  * of the last record processed.  The only extra state needed between
!  * calls is W, a random state variable.
   *
   * init_selection_state computes the initial W value.
   *
!  * Given that we've already processed t records (t >= n), get_next_S
!  * determines the number of records to skip before the next record is
!  * processed.
   */
  static double
  init_selection_state(int n)
***************
*** 897,932 ****
      return exp(-log(random_fract()) / n);
  }

! static double
! select_next_random_record(double t, int n, double *stateptr)
  {
      /* The magic constant here is T from Vitter's paper */
!     if (t <= (22.0 * n))
      {
          /* Process records using Algorithm X until t is large enough */
          double        V,
                      quot;

          V = random_fract();        /* Generate V */
          t += 1;
!         quot = (t - (double) n) / t;
          /* Find min S satisfying (4.1) */
          while (quot > V)
          {
              t += 1;
!             quot *= (t - (double) n) / t;
          }
      }
      else
      {
          /* Now apply Algorithm Z */
          double        W = *stateptr;
!         double        term = t - (double) n + 1;
!         double        S;

          for (;;)
          {
!             double        numer,
                          numer_lim,
                          denom;
              double        U,
--- 938,976 ----
      return exp(-log(random_fract()) / n);
  }

! static TupleCount
! get_next_S(TupleCount t, int n, double *stateptr)
  {
      /* The magic constant here is T from Vitter's paper */
!     if (t <= (22 * n))
      {
          /* Process records using Algorithm X until t is large enough */
+         TupleCount S = 0;
          double        V,
                      quot;

          V = random_fract();        /* Generate V */
          t += 1;
!         quot = 1.0 - (double) n / t;
          /* Find min S satisfying (4.1) */
          while (quot > V)
          {
+             S += 1;
              t += 1;
!             quot *= 1.0 - (double) n / t;
          }
+         return S;
      }
      else
      {
          /* Now apply Algorithm Z */
          double        W = *stateptr;
!         TupleCount    term = t - n + 1;
!         TupleCount    S;

          for (;;)
          {
!             TupleCount    numer,
                          numer_lim,
                          denom;
              double        U,
***************
*** 941,947 ****
              X = t * (W - 1.0);
              S = floor(X);        /* S is tentatively set to floor(X) */
              /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
!             tmp = (t + 1) / term;
              lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
              rhs = (((t + X) / (term + S)) * term) / t;
              if (lhs <= rhs)
--- 985,991 ----
              X = t * (W - 1.0);
              S = floor(X);        /* S is tentatively set to floor(X) */
              /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
!             tmp = (double) (t + 1) / term;
              lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
              rhs = (((t + X) / (term + S)) * term) / t;
              if (lhs <= rhs)
***************
*** 951,979 ****
              }
              /* Test if U <= f(S)/cg(X) */
              y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
!             if ((double) n < S)
              {
                  denom = t;
                  numer_lim = term + S;
              }
              else
              {
!                 denom = t - (double) n + S;
                  numer_lim = t + 1;
              }
              for (numer = t + S; numer >= numer_lim; numer -= 1)
              {
!                 y *= numer / denom;
                  denom -= 1;
              }
              W = exp(-log(random_fract()) / n);    /* Generate W in advance */
              if (exp(log(y) / n) <= (t + X) / t)
                  break;
          }
-         t += S + 1;
          *stateptr = W;
      }
-     return t;
  }

  /*
--- 995,1022 ----
              }
              /* Test if U <= f(S)/cg(X) */
              y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
!             if (n < S)
              {
                  denom = t;
                  numer_lim = term + S;
              }
              else
              {
!                 denom = t - n + S;
                  numer_lim = t + 1;
              }
              for (numer = t + S; numer >= numer_lim; numer -= 1)
              {
!                 y *= (double) numer / denom;
                  denom -= 1;
              }
              W = exp(-log(random_fract()) / n);    /* Generate W in advance */
              if (exp(log(y) / n) <= (t + X) / t)
                  break;
          }
          *stateptr = W;
+         return S;
      }
  }

  /*

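A footnote on stage two, for readers new to reservoir sampling: the skip
counts computed by get_next_S make the sampler equivalent in distribution
to the naive Algorithm R sketched below, which spends one random number on
every live row instead of roughly one per selected row.  Again, this is
illustrative code of my own, not part of the patch; it samples integers
rather than heap tuples.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SAMPLESIZE 10

/*
 * Naive reservoir sampling (Algorithm R from Vitter's paper): copy the
 * first n records into the reservoir, then let each subsequent record
 * replace a random reservoir slot with probability n / (records seen so
 * far).  At every step the reservoir is a uniform random sample of the
 * records seen.
 */
int
main(void)
{
    int         reservoir[SAMPLESIZE];
    int         t;

    srand((unsigned) time(NULL));

    for (t = 0; t < 1000; t++)      /* pretend the table has 1000 rows */
    {
        if (t < SAMPLESIZE)
            reservoir[t] = t;
        else
        {
            int         k = rand() % (t + 1);   /* roughly uniform in [0, t] */

            if (k < SAMPLESIZE)
                reservoir[k] = t;   /* replace one old entry at random */
        }
    }

    for (t = 0; t < SAMPLESIZE; t++)
        printf("%d\n", reservoir[t]);
    return 0;
}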