Re: Seq scans status update - Mailing list pgsql-patches

From Heikki Linnakangas
Subject Re: Seq scans status update
Date
Msg-id 4657395B.9000706@enterprisedb.com
In response to Re: Seq scans status update  (Bruce Momjian <bruce@momjian.us>)
Responses Re: Seq scans status update  (Tom Lane <tgl@sss.pgh.pa.us>)
Re: Seq scans status update  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-patches
Here's a new version; all known issues are now fixed. I'm now happy with
this patch.

Next, I'll start looking at the latest version of Jeff's synchronized
scans patch.
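
For quick reference while reading the attached patch: the old
StrategyHintVacuum(bool) hint is replaced by SetAccessPattern(), which bulk
operations bracket around their I/O. Here's a minimal sketch of the intended
call pattern, distilled from the copy.c and vacuum.c hunks below; the
bulk_load_example() wrapper is hypothetical and for illustration only, and
the error-path reset is handled as in vacuum.c's error-recovery block before
PG_RE_THROW():

    #include "storage/bufmgr.h"     /* AccessPattern, SetAccessPattern() */

    static void
    bulk_load_example(bool use_wal)
    {
        /* COPY needs the large ring only when its writes require WAL flushes */
        SetAccessPattern(use_wal ? AP_COPY : AP_BULKREAD);

        /* ... heap_insert() loop or other bulk I/O goes here ... */

        /* always reset the hint when the bulk operation is done */
        SetAccessPattern(AP_NORMAL);
    }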

Bruce Momjian wrote:
> Great.  Based on this, do you have a patch that is ready to apply?
>
> ---------------------------------------------------------------------------
>
> Heikki Linnakangas wrote:
>> Heikki Linnakangas wrote:
>>> In any case, I'd like to see more test results before we make a
>>> decision. I'm running tests with DBT-2 and a seq scan running in the
>>> background to see if the cache-spoiling effect shows up. I'm also trying
>>> to get hold of some bigger hardware to run on. Running these tests takes
>>> some calendar time, but the hard work has already been done. I'm going
>>> to start reviewing Jeff's synchronized scans patch now.
>> Here are the results of the DBT-2 tests:
>>
>> http://community.enterprisedb.com/seqscan/imola/
>>
>> In each of these tests, at the end of the ramp-up period a script is started
>> that issues a "SELECT COUNT(*) FROM stock" in a loop, with a 2 minute delay
>> between the end of the previous query and the start of the next one.
>>
>> The patch makes the seq scans go significantly faster. In the 1 hour
>> test period, the patched tests perform roughly 30-100% more selects than
>> the unpatched tests.
>>
>> With 100 and 105 warehouses, it also significantly reduces the impact of
>> the seq scan on other queries; response times are lower with the patch.
>> With 120 warehouses the reduction in impact is not as clear, but it's still
>> there when you plot the response times (the plots on the "response time
>> charts" page are useless because they're overwhelmed by the checkpoint
>> spike).
>>
>> --
>>    Heikki Linnakangas
>>    EnterpriseDB   http://www.enterprisedb.com
>>
>> ---------------------------(end of broadcast)---------------------------
>> TIP 5: don't forget to increase your free space map settings
>


--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/access/heap/heapam.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.232
diff -c -r1.232 heapam.c
*** src/backend/access/heap/heapam.c    8 Apr 2007 01:26:27 -0000    1.232
--- src/backend/access/heap/heapam.c    25 May 2007 19:22:33 -0000
***************
*** 83,88 ****
--- 83,96 ----
       */
      scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);

+     /* A scan on a table smaller than shared_buffers is treated like random
+      * access, but bigger scans use the bulk read page replacement policy.
+      */
+     if (scan->rs_nblocks > NBuffers)
+         scan->rs_accesspattern = AP_BULKREAD;
+     else
+         scan->rs_accesspattern = AP_NORMAL;
+
      scan->rs_inited = false;
      scan->rs_ctup.t_data = NULL;
      ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 123,133 ****
--- 131,146 ----

      Assert(page < scan->rs_nblocks);

+     /* Read the page with the right strategy */
+     SetAccessPattern(scan->rs_accesspattern);
+
      scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
                                           scan->rs_rd,
                                           page);
      scan->rs_cblock = page;

+     SetAccessPattern(AP_NORMAL);
+
      if (!scan->rs_pageatatime)
          return;

Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.268
diff -c -r1.268 xlog.c
*** src/backend/access/transam/xlog.c    30 Apr 2007 21:01:52 -0000    1.268
--- src/backend/access/transam/xlog.c    15 May 2007 16:23:30 -0000
***************
*** 1668,1673 ****
--- 1668,1700 ----
  }

  /*
+  * Returns true if 'record' hasn't been flushed to disk yet.
+  */
+ bool
+ XLogNeedsFlush(XLogRecPtr record)
+ {
+     /* Quick exit if already known flushed */
+     if (XLByteLE(record, LogwrtResult.Flush))
+         return false;
+
+     /* read LogwrtResult and update local state */
+     {
+         /* use volatile pointer to prevent code rearrangement */
+         volatile XLogCtlData *xlogctl = XLogCtl;
+
+         SpinLockAcquire(&xlogctl->info_lck);
+         LogwrtResult = xlogctl->LogwrtResult;
+         SpinLockRelease(&xlogctl->info_lck);
+     }
+
+     /* check again */
+     if (XLByteLE(record, LogwrtResult.Flush))
+         return false;
+
+     return true;
+ }
+
+ /*
   * Ensure that all XLOG data through the given position is flushed to disk.
   *
   * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.283
diff -c -r1.283 copy.c
*** src/backend/commands/copy.c    27 Apr 2007 22:05:46 -0000    1.283
--- src/backend/commands/copy.c    15 May 2007 17:05:29 -0000
***************
*** 1876,1881 ****
--- 1876,1888 ----
      nfields = file_has_oids ? (attr_count + 1) : attr_count;
      field_strings = (char **) palloc(nfields * sizeof(char *));

+     /* Use the special COPY buffer replacement strategy if WAL-logging
+      * is enabled. If it's not, the pages we're writing are dirty but
+      * don't need a WAL flush to write out, so the BULKREAD strategy
+      * is more suitable.
+      */
+     SetAccessPattern(use_wal ? AP_COPY : AP_BULKREAD);
+
      /* Initialize state variables */
      cstate->fe_eof = false;
      cstate->eol_type = EOL_UNKNOWN;
***************
*** 2161,2166 ****
--- 2168,2176 ----
                              cstate->filename)));
      }

+     /* Reset buffer replacement strategy */
+     SetAccessPattern(AP_NORMAL);
+
      /*
       * If we skipped writing WAL, then we need to sync the heap (but not
       * indexes since those use WAL anyway)
Index: src/backend/commands/vacuum.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/vacuum.c,v
retrieving revision 1.350
diff -c -r1.350 vacuum.c
*** src/backend/commands/vacuum.c    16 Apr 2007 18:29:50 -0000    1.350
--- src/backend/commands/vacuum.c    15 May 2007 17:06:18 -0000
***************
*** 421,431 ****
                   * Tell the buffer replacement strategy that vacuum is causing
                   * the IO
                   */
!                 StrategyHintVacuum(true);

                  analyze_rel(relid, vacstmt);

!                 StrategyHintVacuum(false);

                  if (use_own_xacts)
                      CommitTransactionCommand();
--- 421,431 ----
                   * Tell the buffer replacement strategy that vacuum is causing
                   * the IO
                   */
!                 SetAccessPattern(AP_VACUUM);

                  analyze_rel(relid, vacstmt);

!                 SetAccessPattern(AP_NORMAL);

                  if (use_own_xacts)
                      CommitTransactionCommand();
***************
*** 442,448 ****
          /* Make sure cost accounting is turned off after error */
          VacuumCostActive = false;
          /* And reset buffer replacement strategy, too */
!         StrategyHintVacuum(false);
          PG_RE_THROW();
      }
      PG_END_TRY();
--- 442,448 ----
          /* Make sure cost accounting is turned off after error */
          VacuumCostActive = false;
          /* And reset buffer replacement strategy, too */
!         SetAccessPattern(AP_NORMAL);
          PG_RE_THROW();
      }
      PG_END_TRY();
***************
*** 1088,1094 ****
       * Tell the cache replacement strategy that vacuum is causing all
       * following IO
       */
!     StrategyHintVacuum(true);

      /*
       * Do the actual work --- either FULL or "lazy" vacuum
--- 1088,1094 ----
       * Tell the cache replacement strategy that vacuum is causing all
       * following IO
       */
!     SetAccessPattern(AP_VACUUM);

      /*
       * Do the actual work --- either FULL or "lazy" vacuum
***************
*** 1098,1104 ****
      else
          lazy_vacuum_rel(onerel, vacstmt);

!     StrategyHintVacuum(false);

      /* all done with this class, but hold lock until commit */
      relation_close(onerel, NoLock);
--- 1098,1104 ----
      else
          lazy_vacuum_rel(onerel, vacstmt);

!     SetAccessPattern(AP_NORMAL);

      /* all done with this class, but hold lock until commit */
      relation_close(onerel, NoLock);
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.11
diff -c -r1.11 README
*** src/backend/storage/buffer/README    23 Jul 2006 03:07:58 -0000    1.11
--- src/backend/storage/buffer/README    16 May 2007 11:43:11 -0000
***************
*** 152,159 ****
  a field to show which backend is doing its I/O).


! Buffer replacement strategy
! ---------------------------

  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
--- 152,159 ----
  a field to show which backend is doing its I/O).


! Normal buffer replacement strategy
! ----------------------------------

  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
***************
*** 199,221 ****
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)

- A special provision is that while running VACUUM, a backend does not
- increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
- sees that it is dropping the pin count to zero and the usage count is zero,
- then it appends the buffer to the tail of the free list.  (This implies that
- VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
- this shouldn't create much of a contention problem.)  This provision
- encourages VACUUM to work in a relatively small number of buffers rather
- than blowing out the entire buffer cache.  It is reasonable since a page
- that has been touched only by VACUUM is unlikely to be needed again soon.
-
- Since VACUUM usually requests many pages very fast, the effect of this is that
- it will get back the very buffers it filled and possibly modified on the next
- call and will therefore do its work in a few shared memory buffers, while
- being able to use whatever it finds in the cache already.  This also implies
- that most of the write traffic caused by a VACUUM will be done by the VACUUM
- itself and not pushed off onto other processes.


  Background writer's processing
  ------------------------------
--- 199,243 ----
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)


+ Buffer ring replacement strategy
+ --------------------------------
+
+ When running a query that needs to access a large number of pages, like VACUUM,
+ COPY, or a large sequential scan, a different strategy is used.  A page that
+ has been touched only by such a scan is unlikely to be needed again soon, so
+ instead of running the normal clock sweep algorithm and blowing out the entire
+ buffer cache, a small ring of buffers is allocated using the normal clock sweep
+ algorithm and those buffers are reused for the whole scan.  This also implies
+ that most of the write traffic caused by such a statement will be done by the
+ backend itself and not pushed off onto other processes.
+
+ The size of the ring used depends on the kind of scan:
+
+ For sequential scans, a small 256 KB ring is used. That's small enough to fit
+ in L2 cache, which makes transferring pages from OS cache to shared buffer
+ cache efficient. Even less would often be enough, but the ring must be big
+ enough to accommodate all pages in the scan that are pinned concurrently.
+ 256 KB should also be enough to leave a small cache trail for other backends to
+ join in a synchronized seq scan. If a buffer is dirtied and LSN set, the buffer
+ is removed from the ring and a replacement buffer is chosen using the normal
+ replacement strategy. In a scan that modifies every page in the scan, like a
+ bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and the
+ ring strategy effectively degrades to the normal strategy.
+
+ VACUUM uses a 256 KB ring like sequential scans, but dirty pages are not
+ removed from the ring. WAL is flushed instead to allow reuse of the buffers.
+ Before introducing the buffer ring strategy in 8.3, buffers were put on the
+ freelist, which was effectively a buffer ring of 1 buffer.
+
+ COPY behaves like VACUUM, but a much larger ring is used. The ring size is
+ chosen to be twice the WAL segment size. This avoids polluting the buffer cache
+ like the clock sweep would do, and using a ring larger than WAL segment size
+ avoids having to do any extra WAL flushes, since a WAL segment will always be
+ filled, forcing a WAL flush, before looping through the buffer ring and bumping
+ into a buffer that would force a WAL flush. However, for non-WAL-logged COPY
+ operations the smaller 256 KB ring is used because WAL flushes are not needed
+ to write the buffers.

  Background writer's processing
  ------------------------------
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.218
diff -c -r1.218 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c    2 May 2007 23:34:48 -0000    1.218
--- src/backend/storage/buffer/bufmgr.c    16 May 2007 12:34:10 -0000
***************
*** 419,431 ****
      /* Loop here in case we have to try another victim buffer */
      for (;;)
      {
          /*
           * Select a victim buffer.    The buffer is returned with its header
           * spinlock still held!  Also the BufFreelistLock is still held, since
           * it would be bad to hold the spinlock while possibly waking up other
           * processes.
           */
!         buf = StrategyGetBuffer();

          Assert(buf->refcount == 0);

--- 419,433 ----
      /* Loop here in case we have to try another victim buffer */
      for (;;)
      {
+         bool lock_held;
+
          /*
           * Select a victim buffer.    The buffer is returned with its header
           * spinlock still held!  Also the BufFreelistLock is still held, since
           * it would be bad to hold the spinlock while possibly waking up other
           * processes.
           */
!         buf = StrategyGetBuffer(&lock_held);

          Assert(buf->refcount == 0);

***************
*** 436,442 ****
          PinBuffer_Locked(buf);

          /* Now it's safe to release the freelist lock */
!         LWLockRelease(BufFreelistLock);

          /*
           * If the buffer was dirty, try to write it out.  There is a race
--- 438,445 ----
          PinBuffer_Locked(buf);

          /* Now it's safe to release the freelist lock */
!         if (lock_held)
!             LWLockRelease(BufFreelistLock);

          /*
           * If the buffer was dirty, try to write it out.  There is a race
***************
*** 464,469 ****
--- 467,489 ----
               */
              if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
              {
+                 /* In BULKREAD-mode, check if a WAL flush would be needed to
+                  * evict this buffer. If so, ask the replacement strategy if
+                  * we should go ahead and do it or choose another victim.
+                  */
+                 if (active_access_pattern == AP_BULKREAD)
+                 {
+                     if (XLogNeedsFlush(BufferGetLSN(buf)))
+                     {
+                         if (StrategyRejectBuffer(buf))
+                         {
+                             LWLockRelease(buf->content_lock);
+                             UnpinBuffer(buf, true, false);
+                             continue;
+                         }
+                     }
+                 }
+
                  FlushBuffer(buf, NULL);
                  LWLockRelease(buf->content_lock);
              }
***************
*** 925,932 ****
      PrivateRefCount[b]--;
      if (PrivateRefCount[b] == 0)
      {
-         bool        immed_free_buffer = false;
-
          /* I'd better not still hold any locks on the buffer */
          Assert(!LWLockHeldByMe(buf->content_lock));
          Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
--- 945,950 ----
***************
*** 940,956 ****
          /* Update buffer usage info, unless this is an internal access */
          if (normalAccess)
          {
!             if (!strategy_hint_vacuum)
              {
!                 if (buf->usage_count < BM_MAX_USAGE_COUNT)
!                     buf->usage_count++;
              }
              else
!             {
!                 /* VACUUM accesses don't bump usage count, instead... */
!                 if (buf->refcount == 0 && buf->usage_count == 0)
!                     immed_free_buffer = true;
!             }
          }

          if ((buf->flags & BM_PIN_COUNT_WAITER) &&
--- 958,975 ----
          /* Update buffer usage info, unless this is an internal access */
          if (normalAccess)
          {
!             if (active_access_pattern != AP_NORMAL)
              {
!                 /* We don't want large one-off scans like vacuum to inflate
!                  * the usage_count. We do want to set it to 1, though, to keep
!                  * other backends from hijacking it from the buffer ring.
!                  */
!                 if (buf->usage_count == 0)
!                     buf->usage_count = 1;
              }
              else
!             if (buf->usage_count < BM_MAX_USAGE_COUNT)
!                 buf->usage_count++;
          }

          if ((buf->flags & BM_PIN_COUNT_WAITER) &&
***************
*** 965,978 ****
          }
          else
              UnlockBufHdr(buf);
-
-         /*
-          * If VACUUM is releasing an otherwise-unused buffer, send it to the
-          * freelist for near-term reuse.  We put it at the tail so that it
-          * won't be used before any invalid buffers that may exist.
-          */
-         if (immed_free_buffer)
-             StrategyFreeBuffer(buf, false);
      }
  }

--- 984,989 ----
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.58
diff -c -r1.58 freelist.c
*** src/backend/storage/buffer/freelist.c    5 Jan 2007 22:19:37 -0000    1.58
--- src/backend/storage/buffer/freelist.c    25 May 2007 19:25:16 -0000
***************
*** 18,23 ****
--- 18,25 ----
  #include "storage/buf_internals.h"
  #include "storage/bufmgr.h"

+ #include "utils/memutils.h"
+

  /*
   * The shared freelist control information.
***************
*** 39,47 ****
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;

! /* Backend-local state about whether currently vacuuming */
! bool        strategy_hint_vacuum = false;


  /*
   * StrategyGetBuffer
--- 41,59 ----
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;

! /* Currently active access pattern hint. */
! AccessPattern active_access_pattern = AP_NORMAL;

+ /* prototypes for internal functions */
+ static volatile BufferDesc *GetBufferFromRing(void);
+ static void PutBufferToRing(volatile BufferDesc *buf);
+ static void InitRing(void);
+
+ /* Did the last buffer returned by StrategyGetBuffer come from the buffer
+  * ring or from freelist/clock sweep? StrategyRejectBuffer needs to know
+  * that, see comments there.
+  */
+ static bool lastBufferCameFromRing = false;

  /*
   * StrategyGetBuffer
***************
*** 51,67 ****
   *    the selected buffer must not currently be pinned by anyone.
   *
   *    To ensure that no one else can pin the buffer before we do, we must
!  *    return the buffer with the buffer header spinlock still held.  That
!  *    means that we return with the BufFreelistLock still held, as well;
!  *    the caller must release that lock once the spinlock is dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(void)
  {
      volatile BufferDesc *buf;
      int            trycounter;

      LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

      /*
       * Try to get a buffer from the freelist.  Note that the freeNext fields
--- 63,98 ----
   *    the selected buffer must not currently be pinned by anyone.
   *
   *    To ensure that no one else can pin the buffer before we do, we must
!  *    return the buffer with the buffer header spinlock still held.  If
!  *    *lock_held is set at return, we return with the BufFreelistLock still
!  *    held, as well;    the caller must release that lock once the spinlock is
!  *    dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(bool *lock_held)
  {
      volatile BufferDesc *buf;
      int            trycounter;

+     /* Get a buffer from the ring if we're doing a bulk scan */
+     if (active_access_pattern != AP_NORMAL)
+     {
+         buf = GetBufferFromRing();
+         if (buf != NULL)
+         {
+             *lock_held = false;
+             lastBufferCameFromRing = true;
+             return buf;
+         }
+     }
+
+     lastBufferCameFromRing = false;
+
+     /*
+      * If our selected buffer wasn't available, pick another...
+      */
      LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+     *lock_held = true;

      /*
       * Try to get a buffer from the freelist.  Note that the freeNext fields
***************
*** 86,96 ****
           */
          LockBufHdr(buf);
          if (buf->refcount == 0 && buf->usage_count == 0)
              return buf;
          UnlockBufHdr(buf);
      }

!     /* Nothing on the freelist, so run the "clock sweep" algorithm */
      trycounter = NBuffers;
      for (;;)
      {
--- 117,131 ----
           */
          LockBufHdr(buf);
          if (buf->refcount == 0 && buf->usage_count == 0)
+         {
+             if (active_access_pattern != AP_NORMAL)
+                 PutBufferToRing(buf);
              return buf;
+         }
          UnlockBufHdr(buf);
      }

!     /* Nothing on the freelist, so run the shared "clock sweep" algorithm */
      trycounter = NBuffers;
      for (;;)
      {
***************
*** 105,111 ****
--- 140,150 ----
           */
          LockBufHdr(buf);
          if (buf->refcount == 0 && buf->usage_count == 0)
+         {
+             if (active_access_pattern != AP_NORMAL)
+                 PutBufferToRing(buf);
              return buf;
+         }
          if (buf->usage_count > 0)
          {
              buf->usage_count--;
***************
*** 191,204 ****
  }

  /*
!  * StrategyHintVacuum -- tell us whether VACUUM is active
   */
  void
! StrategyHintVacuum(bool vacuum_active)
  {
!     strategy_hint_vacuum = vacuum_active;
! }


  /*
   * StrategyShmemSize
--- 230,254 ----
  }

  /*
!  * SetAccessPattern -- Sets the active access pattern hint
!  *
!  * Caller is responsible for resetting the hint to AP_NORMAL after the bulk
!  * operation is done. It's ok to switch repeatedly between AP_NORMAL and one of
!  * the other strategies, for example in a query with one large sequential scan
!  * nested loop joined to an index scan. Index tuples should be fetched with the
!  * normal strategy and the pages from the seq scan should be read in with the
!  * AP_BULKREAD strategy. The ring won't be affected by such switching, however
!  * switching to an access pattern with different ring size will invalidate the
!  * old ring.
   */
  void
! SetAccessPattern(AccessPattern new_pattern)
  {
!     active_access_pattern = new_pattern;

+     if (active_access_pattern != AP_NORMAL)
+         InitRing();
+ }

  /*
   * StrategyShmemSize
***************
*** 274,276 ****
--- 324,504 ----
      else
          Assert(!init);
  }
+
+ /* ----------------------------------------------------------------
+  *                Backend-private buffer ring management
+  * ----------------------------------------------------------------
+  */
+
+ /*
+  * Ring sizes for different access patterns. See README for the rationale
+  * of these.
+  */
+ #define BULKREAD_RING_SIZE    256 * 1024 / BLCKSZ
+ #define VACUUM_RING_SIZE    256 * 1024 / BLCKSZ
+ #define COPY_RING_SIZE        Min(NBuffers / 8, (XLOG_SEG_SIZE / BLCKSZ) * 2)
+
+ /*
+  * BufferRing is an array of buffer ids, and RingSize is its size in number of
+  * elements. It's allocated in TopMemoryContext the first time it's needed.
+  */
+ static int *BufferRing = NULL;
+ static int RingSize = 0;
+
+ /* Index of the "current" slot in the ring. It's advanced every time a buffer
+  * is handed out from the ring with GetBufferFromRing and it points to the
+  * last buffer returned from the ring. RingCurSlot + 1 is the next victim
+  * GetBufferRing will hand out.
+  */
+ static int RingCurSlot = 0;
+
+ /* magic value to mark empty slots in the ring */
+ #define BUF_ID_NOT_SET -1
+
+
+ /*
+  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+  *        ring is empty.
+  *
+  * The bufhdr spin lock is held on the returned buffer.
+  */
+ static volatile BufferDesc *
+ GetBufferFromRing(void)
+ {
+     volatile BufferDesc *buf;
+
+     /* ring should be initialized by now */
+     Assert(RingSize > 0 && BufferRing != NULL);
+
+     /* Run private "clock cycle" */
+     if (++RingCurSlot >= RingSize)
+         RingCurSlot = 0;
+
+     /*
+      * If that slot hasn't been filled yet, tell the caller to allocate
+      * a new buffer with the normal allocation strategy. He will then
+      * fill this slot by calling PutBufferToRing with the new buffer.
+      */
+     if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+         return NULL;
+
+     buf = &BufferDescriptors[BufferRing[RingCurSlot]];
+
+     /*
+      * If the buffer is pinned we cannot use it under any circumstances.
+      * If usage_count == 0 then the buffer is fair game.
+      *
+      * We also choose this buffer if usage_count == 1. Strictly, this
+      * might sometimes be the wrong thing to do, but we rely on the high
+      * probability that it was this process that last touched the buffer.
+      * If it wasn't, we'll choose a suboptimal victim, but  it shouldn't
+      * make any difference in the big scheme of things.
+      *
+      */
+     LockBufHdr(buf);
+     if (buf->refcount == 0 && buf->usage_count <= 1)
+         return buf;
+     UnlockBufHdr(buf);
+
+     return NULL;
+ }
+
+ /*
+  * PutBufferToRing -- adds a buffer to the buffer ring
+  *
+  * Caller must hold the buffer header spinlock on the buffer.
+  */
+ static void
+ PutBufferToRing(volatile BufferDesc *buf)
+ {
+     /* ring should be initialized by now */
+     Assert(RingSize > 0 && BufferRing != NULL);
+
+     if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+         BufferRing[RingCurSlot] = buf->buf_id;
+
+ }
+
+ /*
+  * Initializes a ring buffer with correct size for the currently
+  * active strategy. Does nothing if the ring already has the right size.
+  */
+ static void
+ InitRing(void)
+ {
+     int new_size;
+     int old_size = RingSize;
+     int i;
+     MemoryContext oldcxt;
+
+     /* Determine new size */
+
+     switch(active_access_pattern)
+     {
+         case AP_BULKREAD:
+             new_size = BULKREAD_RING_SIZE;
+             break;
+         case AP_COPY:
+             new_size = COPY_RING_SIZE;
+             break;
+         case AP_VACUUM:
+             new_size = VACUUM_RING_SIZE;
+             break;
+         default:
+             elog(ERROR, "unexpected buffer cache strategy %d",
+                  active_access_pattern);
+             return; /* keep compiler happy */
+     }
+
+     /*
+      * Seq scans set and reset the strategy on every page, so we better exit
+      * quickly if no change in size is needed.
+      */
+     if (new_size == old_size)
+         return;
+
+     /* Allocate array */
+
+     oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+     if (old_size == 0)
+     {
+         Assert(BufferRing == NULL);
+         BufferRing = palloc(new_size * sizeof(int));
+     }
+     else
+         BufferRing = repalloc(BufferRing, new_size * sizeof(int));
+
+     MemoryContextSwitchTo(oldcxt);
+
+     for(i = 0; i < new_size; i++)
+         BufferRing[i] = BUF_ID_NOT_SET;
+
+     RingCurSlot = 0;
+     RingSize = new_size;
+ }
+
+ /*
+  * In AP_BULKREAD mode, the buffer manager calls this function when it turns
+  * out that the buffer handed to it by StrategyGetBuffer needs a WAL flush to
+  * write out. That gives us a second chance to choose another victim.
+  *
+  * Returns true if buffer manager should ask for a new victim, and false
+  * if WAL should be flushed and this buffer used.
+  */
+ bool
+ StrategyRejectBuffer(volatile BufferDesc *buf)
+ {
+     Assert(RingSize > 0);
+
+     /* If the buffer didn't come from the ring, we tell the caller to do the
+      * WAL flush and use the buffer. We don't want to mess with how the clock
+      * sweep works; in the worst case there are no buffers in the buffer cache that
+      * can be reused without a WAL flush, and we'd get into an endless loop
+      * trying.
+      */
+     if (lastBufferCameFromRing)
+         BufferRing[RingCurSlot] = BUF_ID_NOT_SET;
+
+     return lastBufferCameFromRing;
+ }
Index: src/include/access/relscan.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/relscan.h,v
retrieving revision 1.52
diff -c -r1.52 relscan.h
*** src/include/access/relscan.h    20 Jan 2007 18:43:35 -0000    1.52
--- src/include/access/relscan.h    15 May 2007 17:01:31 -0000
***************
*** 28,33 ****
--- 28,34 ----
      ScanKey        rs_key;            /* array of scan key descriptors */
      BlockNumber rs_nblocks;        /* number of blocks to scan */
      bool        rs_pageatatime; /* verify visibility page-at-a-time? */
+     AccessPattern rs_accesspattern; /* access pattern to use for reads */

      /* scan current state */
      bool        rs_inited;        /* false = scan not init'd yet */
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h    5 Jan 2007 22:19:51 -0000    1.76
--- src/include/access/xlog.h    14 May 2007 21:22:40 -0000
***************
*** 151,156 ****
--- 151,157 ----

  extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
  extern void XLogFlush(XLogRecPtr RecPtr);
+ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);

  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.89
diff -c -r1.89 buf_internals.h
*** src/include/storage/buf_internals.h    5 Jan 2007 22:19:57 -0000    1.89
--- src/include/storage/buf_internals.h    15 May 2007 17:07:59 -0000
***************
*** 16,21 ****
--- 16,22 ----
  #define BUFMGR_INTERNALS_H

  #include "storage/buf.h"
+ #include "storage/bufmgr.h"
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/spin.h"
***************
*** 168,174 ****
  extern BufferDesc *LocalBufferDescriptors;

  /* in freelist.c */
! extern bool strategy_hint_vacuum;

  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
--- 169,175 ----
  extern BufferDesc *LocalBufferDescriptors;

  /* in freelist.c */
! extern AccessPattern active_access_pattern;

  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
***************
*** 184,195 ****
   */

  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(void);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int    StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);

  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
--- 185,198 ----
   */

  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(bool *lock_held);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int    StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);

+ extern bool StrategyRejectBuffer(volatile BufferDesc *buf);
+
  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.103
diff -c -r1.103 bufmgr.h
*** src/include/storage/bufmgr.h    2 May 2007 23:18:03 -0000    1.103
--- src/include/storage/bufmgr.h    25 May 2007 19:23:37 -0000
***************
*** 49,54 ****
--- 49,67 ----
  #define BUFFER_LOCK_EXCLUSIVE    2

  /*
+  * For better cache efficiency, we use different buffer replacement strategies
+  * for different kinds of access patterns. Use SetAccessPattern to hint the
+  * buffer manager which kind of access we're doing at the moment.
+  */
+ typedef enum AccessPattern
+ {
+     AP_NORMAL,        /* Normal random access */
+     AP_BULKREAD,    /* Large read-only scan (hint bit updates are ok) */
+     AP_COPY,        /* Large updating scan, like COPY with WAL enabled */
+     AP_VACUUM,        /* VACUUM */
+ } AccessPattern;
+
+ /*
   * These routines are beaten on quite heavily, hence the macroization.
   */

***************
*** 157,162 ****
  extern void AtProcExit_LocalBuffers(void);

  /* in freelist.c */
! extern void StrategyHintVacuum(bool vacuum_active);

  #endif
--- 170,175 ----
  extern void AtProcExit_LocalBuffers(void);

  /* in freelist.c */
! extern void SetAccessPattern(AccessPattern new_pattern);

  #endif
