Experimental ARC implementation - Mailing list pgsql-hackers

From Jan Wieck
Subject Experimental ARC implementation
Date
Msg-id 3FA67541.9020105@Yahoo.com
Whole thread Raw
Responses Re: Experimental ARC implementation  (Jan Wieck <JanWieck@Yahoo.com>)
List pgsql-hackers
Attached is a first trial implementation of the Adaptive Replacement
Cache (ARC). The patch is against CVS HEAD of 7.4.

The algorithm is based on what's described in these papers:

http://www.almaden.ibm.com/StorageSystems/autonomic_storage/ARC/arcfast.pdf
http://www.almaden.ibm.com/StorageSystems/autonomic_storage/ARC/oneup.pdf

kindly pointed to by Cuong Bui a few days ago.

To work with PostgreSQL and to mostly keep our differentiation between
the buffer manager and the cache replacement policy I implemented it as
a set of (hopefully) more strategy independent, separate calls that
mostly replace freelist.c. The main problem why the code differs
optically from the pseudo code found in oneup.pdf is, that it needs to
be concurrency safe. In particular the underlying list structures can be
changed by other backends during IO, and that happens half way through
the algorithm.

The major change to the existing freelist is an additional layer of
indirection. The original buffer descriptors now only hold a cache page
and it's clean/dirty information and such. The new layer is the cache
directory who's size is different from the number of data blocks in the
buffer cache.

With a TPC-C like transaction profile and a buffer cache size that
allows 10-15% of the database to stay resident, I am not able to see any
dramatic change in behaviour or performance. At the maximum the
difference could be called a "light relief". But that transaction
profile is not a good mix compared to any real world application anyway,
so I wouldn't give too much on that result. It's also possible that I
made some mistakes in the implementation, so have a critical eye on that.

When playing with it please keep a few facts in mind. A 64MB cache
(shared_buffers=8192) needs about 10 minutes of full transaction load to
populate, in benchmark terms this is called ramp-up time. The algorithm
is aiming at scan resistance, so a pure index access approach will not
show any significant changes. Therefore, don't tell me that a 1x scale
10 client by 1000 transactions pgbench run doesn't show any performance
boost, it cannot.

I will follow up shortly with an approach that integrates Tom's delay
mechanism plus my first READ_BY_VACUUM hack into one combined experiement.


Jan

--
#======================================================================#
# It's easier to get forgiveness for being wrong than for being right. #
# Let's break this rule - forgive me.                                  #
#================================================== JanWieck@Yahoo.com #
Index: src/backend/storage/buffer/buf_init.c
===================================================================
RCS file: /home/pgsql/CvsRoot/pgsql-server/src/backend/storage/buffer/buf_init.c,v
retrieving revision 1.54
diff -c -b -r1.54 buf_init.c
*** src/backend/storage/buffer/buf_init.c    2003/08/04 02:40:03    1.54
--- src/backend/storage/buffer/buf_init.c    2003/11/03 13:43:55
***************
*** 48,56 ****
  int            ShowPinTrace = 0;

  int            Data_Descriptors;
- int            Free_List_Descriptor;
- int            Lookup_List_Descriptor;
- int            Num_Descriptors;

  BufferDesc *BufferDescriptors;
  Block       *BufferBlockPointers;
--- 48,53 ----
***************
*** 133,141 ****
      int            i;

      Data_Descriptors = NBuffers;
-     Free_List_Descriptor = Data_Descriptors;
-     Lookup_List_Descriptor = Data_Descriptors + 1;
-     Num_Descriptors = Data_Descriptors + 1;

      /*
       * It's probably not really necessary to grab the lock --- if there's
--- 130,135 ----
***************
*** 156,162 ****

      BufferDescriptors = (BufferDesc *)
          ShmemInitStruct("Buffer Descriptors",
!                       Num_Descriptors * sizeof(BufferDesc), &foundDescs);

      BufferBlocks = (char *)
          ShmemInitStruct("Buffer Blocks",
--- 150,156 ----

      BufferDescriptors = (BufferDesc *)
          ShmemInitStruct("Buffer Descriptors",
!                       Data_Descriptors * sizeof(BufferDesc), &foundDescs);

      BufferBlocks = (char *)
          ShmemInitStruct("Buffer Blocks",
***************
*** 176,191 ****
          block = BufferBlocks;

          /*
!          * link the buffers into a circular, doubly-linked list to
!          * initialize free list, and initialize the buffer headers. Still
!          * don't know anything about replacement strategy in this file.
           */
          for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++)
          {
              Assert(ShmemIsValid((unsigned long) block));

!             buf->freeNext = i + 1;
!             buf->freePrev = i - 1;

              CLEAR_BUFFERTAG(&(buf->tag));
              buf->buf_id = i;
--- 170,183 ----
          block = BufferBlocks;

          /*
!          * link the buffers into a single linked list. This will become the
!          * LiFo list of unused buffers returned by StragegyGetBuffer().
           */
          for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++)
          {
              Assert(ShmemIsValid((unsigned long) block));

!             buf->bufNext = i + 1;

              CLEAR_BUFFERTAG(&(buf->tag));
              buf->buf_id = i;
***************
*** 199,212 ****
              buf->wait_backend_id = 0;
          }

!         /* close the circular queue */
!         BufferDescriptors[0].freePrev = Data_Descriptors - 1;
!         BufferDescriptors[Data_Descriptors - 1].freeNext = 0;
      }

      /* Init other shared buffer-management stuff */
!     InitBufTable();
!     InitFreeList(!foundDescs);

      LWLockRelease(BufMgrLock);
  }
--- 191,202 ----
              buf->wait_backend_id = 0;
          }

!         /* Correct last entry */
!         BufferDescriptors[Data_Descriptors - 1].bufNext = -1;
      }

      /* Init other shared buffer-management stuff */
!     StrategyInitialize(!foundDescs);

      LWLockRelease(BufMgrLock);
  }
Index: src/backend/storage/buffer/buf_table.c
===================================================================
RCS file: /home/pgsql/CvsRoot/pgsql-server/src/backend/storage/buffer/buf_table.c,v
retrieving revision 1.29
diff -c -b -r1.29 buf_table.c
*** src/backend/storage/buffer/buf_table.c    2003/08/04 02:40:03    1.29
--- src/backend/storage/buffer/buf_table.c    2003/11/03 13:45:13
***************
*** 38,44 ****
   * Initialize shmem hash table for mapping buffers
   */
  void
! InitBufTable(void)
  {
      HASHCTL        info;

--- 38,44 ----
   * Initialize shmem hash table for mapping buffers
   */
  void
! InitBufTable(int size)
  {
      HASHCTL        info;

***************
*** 50,56 ****
      info.hash = tag_hash;

      SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
!                                   NBuffers, NBuffers,
                                    &info,
                                    HASH_ELEM | HASH_FUNCTION);

--- 50,56 ----
      info.hash = tag_hash;

      SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
!                                   size, size,
                                    &info,
                                    HASH_ELEM | HASH_FUNCTION);

***************
*** 58,136 ****
          elog(FATAL, "could not initialize shared buffer hash table");
  }

! BufferDesc *
  BufTableLookup(BufferTag *tagPtr)
  {
      BufferLookupEnt *result;

      if (tagPtr->blockNum == P_NEW)
!         return NULL;

      result = (BufferLookupEnt *)
          hash_search(SharedBufHash, (void *) tagPtr, HASH_FIND, NULL);
      if (!result)
!         return NULL;

!     return &(BufferDescriptors[result->id]);
  }

  /*
   * BufTableDelete
   */
  bool
! BufTableDelete(BufferDesc *buf)
  {
      BufferLookupEnt *result;
!
!     /*
!      * buffer not initialized or has been removed from table already.
!      * BM_DELETED keeps us from removing buffer twice.
!      */
!     if (buf->flags & BM_DELETED)
!         return TRUE;
!
!     buf->flags |= BM_DELETED;

      result = (BufferLookupEnt *)
!         hash_search(SharedBufHash, (void *) &(buf->tag), HASH_REMOVE, NULL);

!     if (!result)                /* shouldn't happen */
!         elog(ERROR, "shared buffer hash table corrupted");

!     /*
!      * Clear the buffer's tag.  This doesn't matter for the hash table,
!      * since the buffer is already removed from it, but it ensures that
!      * sequential searches through the buffer table won't think the buffer
!      * is still valid for its old page.
!      */
!     buf->tag.rnode.relNode = InvalidOid;
!     buf->tag.rnode.tblNode = InvalidOid;

      return TRUE;
  }

  bool
! BufTableInsert(BufferDesc *buf)
  {
      BufferLookupEnt *result;
-     bool        found;
-
-     /* cannot insert it twice */
-     Assert(buf->flags & BM_DELETED);
-     buf->flags &= ~(BM_DELETED);

      result = (BufferLookupEnt *)
!         hash_search(SharedBufHash, (void *) &(buf->tag), HASH_ENTER, &found);

!     if (!result)
!         ereport(ERROR,
!                 (errcode(ERRCODE_OUT_OF_MEMORY),
!                  errmsg("out of shared memory")));
!
!     if (found)                    /* found something else in the table? */
          elog(ERROR, "shared buffer hash table corrupted");

-     result->id = buf->buf_id;
      return TRUE;
  }

--- 58,120 ----
          elog(FATAL, "could not initialize shared buffer hash table");
  }

! /*
!  * BufTableLookup
!  */
! int
  BufTableLookup(BufferTag *tagPtr)
  {
      BufferLookupEnt *result;

      if (tagPtr->blockNum == P_NEW)
!         return -1;

      result = (BufferLookupEnt *)
          hash_search(SharedBufHash, (void *) tagPtr, HASH_FIND, NULL);
      if (!result)
!         return -1;

!     return result->id;
  }

  /*
   * BufTableDelete
   */
  bool
! BufTableInsert(BufferTag *tagPtr, Buffer buf_id)
  {
      BufferLookupEnt *result;
!     bool        found;

      result = (BufferLookupEnt *)
!         hash_search(SharedBufHash, (void *) tagPtr, HASH_ENTER, &found);

!     if (!result)
!         ereport(ERROR,
!                 (errcode(ERRCODE_OUT_OF_MEMORY),
!                  errmsg("out of shared memory")));

!     if (found)                    /* found something else in the table? */
!         elog(ERROR, "shared buffer hash table corrupted");

+     result->id = buf_id;
      return TRUE;
  }

+ /*
+  * BufTableDelete
+  */
  bool
! BufTableDelete(BufferTag *tagPtr)
  {
      BufferLookupEnt *result;

      result = (BufferLookupEnt *)
!         hash_search(SharedBufHash, (void *) tagPtr, HASH_REMOVE, NULL);

!     if (!result)                /* shouldn't happen */
          elog(ERROR, "shared buffer hash table corrupted");

      return TRUE;
  }

Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/pgsql/CvsRoot/pgsql-server/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.141
diff -c -b -r1.141 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c    2003/09/25 06:58:01    1.141
--- src/backend/storage/buffer/bufmgr.c    2003/11/03 13:50:31
***************
*** 260,271 ****
      if (status == SM_FAIL)
      {
          /* IO Failed.  cleanup the data structures and go home */

-         if (!BufTableDelete(bufHdr))
-         {
-             LWLockRelease(BufMgrLock);
-             elog(FATAL, "buffer table broken after I/O error");
-         }
          /* remember that BufferAlloc() pinned the buffer */
          UnpinBuffer(bufHdr);

--- 260,267 ----
      if (status == SM_FAIL)
      {
          /* IO Failed.  cleanup the data structures and go home */
+         StrategyInvalidateBuffer(bufHdr);

          /* remember that BufferAlloc() pinned the buffer */
          UnpinBuffer(bufHdr);

***************
*** 318,324 ****
      INIT_BUFFERTAG(&newTag, reln, blockNum);

      /* see if the block is in the buffer pool already */
!     buf = BufTableLookup(&newTag);
      if (buf != NULL)
      {
          /*
--- 314,320 ----
      INIT_BUFFERTAG(&newTag, reln, blockNum);

      /* see if the block is in the buffer pool already */
!     buf = StrategyBufferLookup(&newTag, false);
      if (buf != NULL)
      {
          /*
***************
*** 379,385 ****
      inProgress = FALSE;
      for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;)
      {
!         buf = GetFreeBuffer();

          /* GetFreeBuffer will abort if it can't find a free buffer */
          Assert(buf);
--- 375,381 ----
      inProgress = FALSE;
      for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;)
      {
!         buf = StrategyGetBuffer();

          /* GetFreeBuffer will abort if it can't find a free buffer */
          Assert(buf);
***************
*** 492,498 ****
               * we haven't gotten around to insert the new tag into the
               * buffer table. So we need to check here.        -ay 3/95
               */
!             buf2 = BufTableLookup(&newTag);
              if (buf2 != NULL)
              {
                  /*
--- 488,494 ----
               * we haven't gotten around to insert the new tag into the
               * buffer table. So we need to check here.        -ay 3/95
               */
!             buf2 = StrategyBufferLookup(&newTag, true);
              if (buf2 != NULL)
              {
                  /*
***************
*** 535,563 ****
       */

      /*
!      * Change the name of the buffer in the lookup table:
!      *
!      * Need to update the lookup table before the read starts. If someone
!      * comes along looking for the buffer while we are reading it in, we
!      * don't want them to allocate a new buffer.  For the same reason, we
!      * didn't want to erase the buf table entry for the buffer we were
!      * writing back until now, either.
       */
!
!     if (!BufTableDelete(buf))
!     {
!         LWLockRelease(BufMgrLock);
!         elog(FATAL, "buffer wasn't in the buffer hash table");
!     }
!
      INIT_BUFFERTAG(&(buf->tag), reln, blockNum);

-     if (!BufTableInsert(buf))
-     {
-         LWLockRelease(BufMgrLock);
-         elog(FATAL, "buffer in buffer hash table twice");
-     }
-
      /*
       * Buffer contents are currently invalid.  Have to mark IO IN PROGRESS
       * so no one fiddles with them until the read completes.  If this
--- 531,542 ----
       */

      /*
!      * Tell the buffer replacement strategy that we are replacing the
!      * buffer content. Then rename the buffer.
       */
!     StrategyReplaceBuffer(buf, reln, blockNum);
      INIT_BUFFERTAG(&(buf->tag), reln, blockNum);

      /*
       * Buffer contents are currently invalid.  Have to mark IO IN PROGRESS
       * so no one fiddles with them until the read completes.  If this
***************
*** 959,967 ****

              if (isCommit)
                  elog(WARNING,
!                 "buffer refcount leak: [%03d] (freeNext=%d, freePrev=%d, "
                    "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
!                      i, buf->freeNext, buf->freePrev,
                       buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
                       buf->tag.blockNum, buf->flags,
                       buf->refcount, PrivateRefCount[i]);
--- 938,946 ----

              if (isCommit)
                  elog(WARNING,
!                 "buffer refcount leak: [%03d] (bufNext=%d, "
                    "rel=%u/%u, blockNum=%u, flags=0x%x, refcount=%d %ld)",
!                      i, buf->bufNext,
                       buf->tag.rnode.tblNode, buf->tag.rnode.relNode,
                       buf->tag.blockNum, buf->flags,
                       buf->refcount, PrivateRefCount[i]);
***************
*** 1229,1235 ****
              /*
               * And mark the buffer as no longer occupied by this rel.
               */
!             BufTableDelete(bufHdr);
          }
      }

--- 1208,1214 ----
              /*
               * And mark the buffer as no longer occupied by this rel.
               */
!             StrategyInvalidateBuffer(bufHdr);
          }
      }

***************
*** 1295,1301 ****
              /*
               * And mark the buffer as no longer occupied by this page.
               */
!             BufTableDelete(bufHdr);
          }
      }

--- 1274,1280 ----
              /*
               * And mark the buffer as no longer occupied by this page.
               */
!             StrategyInvalidateBuffer(bufHdr);
          }
      }

***************
*** 1543,1549 ****
                  return -2;
              }
              if (bufHdr->tag.blockNum >= firstDelBlock)
!                 BufTableDelete(bufHdr);
          }
      }

--- 1522,1528 ----
                  return -2;
              }
              if (bufHdr->tag.blockNum >= firstDelBlock)
!                 StrategyInvalidateBuffer(bufHdr);
          }
      }

Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /home/pgsql/CvsRoot/pgsql-server/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.31
diff -c -b -r1.31 freelist.c
*** src/backend/storage/buffer/freelist.c    2003/08/04 02:40:03    1.31
--- src/backend/storage/buffer/freelist.c    2003/11/03 14:26:49
***************
*** 1,8 ****
  /*-------------------------------------------------------------------------
   *
   * freelist.c
!  *      routines for manipulating the buffer pool's replacement strategy
!  *      freelist.
   *
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
--- 1,7 ----
  /*-------------------------------------------------------------------------
   *
   * freelist.c
!  *      routines for manipulating the buffer pool's replacement strategy.
   *
   * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
***************
*** 32,85 ****
  #include "storage/ipc.h"
  #include "storage/proc.h"


! static BufferDesc *SharedFreeList;

  /*
!  * State-checking macros
   */

! #define IsInQueue(bf) \
! ( \
!     AssertMacro((bf->freeNext != INVALID_DESCRIPTOR)), \
!     AssertMacro((bf->freePrev != INVALID_DESCRIPTOR)), \
!     AssertMacro((bf->flags & BM_FREE)) \
! )

! #define IsNotInQueue(bf) \
! ( \
!     AssertMacro((bf->freeNext == INVALID_DESCRIPTOR)), \
!     AssertMacro((bf->freePrev == INVALID_DESCRIPTOR)), \
!     AssertMacro(! (bf->flags & BM_FREE)) \
! )


  /*
!  * AddBufferToFreelist
   *
!  * In theory, this is the only routine that needs to be changed
!  * if the buffer replacement strategy changes.    Just change
!  * the manner in which buffers are added to the freelist queue.
!  * Currently, they are added on an LRU basis.
   */
! static void
! AddBufferToFreelist(BufferDesc *bf)
  {
! #ifdef BMTRACE
!     _bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
!               BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
! #endif   /* BMTRACE */
!     IsNotInQueue(bf);

!     /* change bf so it points to inFrontOfNew and its successor */
!     bf->freePrev = SharedFreeList->freePrev;
!     bf->freeNext = Free_List_Descriptor;

!     /* insert new into chain */
!     BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
!     BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
  }

  #undef PinBuffer

  /*
--- 31,612 ----
  #include "storage/ipc.h"
  #include "storage/proc.h"

+ #define STRAT_LIST_UNUSED    -1
+ #define STRAT_LIST_B1        0
+ #define STRAT_LIST_T1        1
+ #define STRAT_LIST_T2        2
+ #define STRAT_LIST_B2        3
+ #define STRAT_NUM_LISTS        4

! #ifndef MAX
! #define MAX(a,b) (((a) > (b)) ? (a) : (b))
! #endif
! #ifndef MIN
! #define MIN(a,b) (((a) < (b)) ? (a) : (b))
! #endif
!
! /*
!  * The Cache Directory Block (CDB) of the Adaptive Replacement Cache (ARC)
!  */
! typedef struct bufstratcdb
! {
!     int            prev;
!     int            next;
!     int            list;
!     BufferTag    buf_tag;
!     Buffer        buf_id;
! } BufferStrategyCDB;
!
! /*
!  * The shared ARC control information.
!  */
! typedef struct bufstratcontrol
! {
!
!     int        target_T1_size;                /* What T1 size are we aiming for */
!     int        listUnusedCDB;                /* All unused StrategyCDB */
!     int        listHead[STRAT_NUM_LISTS];    /* ARC lists B1, T1, T2 and B2 */
!     int        listTail[STRAT_NUM_LISTS];
!     int        listSize[STRAT_NUM_LISTS];
!     Buffer    listFreeBuffers;            /* List of unused buffers */
!
!     BufferStrategyCDB    cdb[1];            /* The cache directory */
! } BufferStrategyControl;
!
!
! static BufferStrategyControl    *StrategyControl = NULL;
! static BufferStrategyCDB        *StrategyCDB = NULL;
!
! static int        strategy_cdb_found;
! static int        strategy_cdb_replace;
! static int        strategy_get_from;
!
!
! #define T1_TARGET    StrategyControl->target_T1_size
! #define B1_LENGTH    StrategyControl->listSize[STRAT_LIST_B1]
! #define T1_LENGTH    StrategyControl->listSize[STRAT_LIST_T1]
! #define T2_LENGTH    StrategyControl->listSize[STRAT_LIST_T2]
! #define B2_LENGTH    StrategyControl->listSize[STRAT_LIST_B2]
!
!
! /*
!  * Macro to remove a CDB from whichever list it currently is on
!  */
! #define    STRAT_LIST_REMOVE(cdb) \
!     AssertMacro((cdb)->list >= 0 && (cdb)->list < STRAT_NUM_LISTS);        \
!     if ((cdb)->prev < 0)                                                \
!         StrategyControl->listHead[(cdb)->list] = (cdb)->next;            \
!     else                                                                \
!         StrategyCDB[(cdb)->prev].next = (cdb)->next;                    \
!     if ((cdb)->next < 0)                                                \
!         StrategyControl->listTail[(cdb)->list] = (cdb)->prev;            \
!     else                                                                \
!         StrategyCDB[(cdb)->next].prev = (cdb)->prev;                    \
!     StrategyControl->listSize[(cdb)->list]--;                            \
!     (cdb)->list = STRAT_LIST_UNUSED;
!
! /*
!  * Macro to add a CDB to the tail of a list (MRU position)
!  */
! #define STRAT_MRU_INSERT(cdb,l) \
!     AssertMacro((cdb)->list == STRAT_LIST_UNUSED);                        \
!     if (StrategyControl->listTail[(l)] < 0)                                \
!     {                                                                    \
!         (cdb)->prev = (cdb)->next = -1;                                    \
!         StrategyControl->listHead[(l)] =                                 \
!             StrategyControl->listTail[(l)] =                            \
!             ((cdb) - StrategyCDB);                                        \
!     }                                                                    \
!     else                                                                \
!     {                                                                    \
!         (cdb)->next = -1;                                                \
!         (cdb)->prev = StrategyControl->listTail[(l)];                    \
!         StrategyCDB[StrategyControl->listTail[(l)]].next =                 \
!             ((cdb) - StrategyCDB);                                        \
!         StrategyControl->listTail[(l)] =                                 \
!             ((cdb) - StrategyCDB);                                        \
!     }                                                                    \
!     StrategyControl->listSize[(l)]++;                                    \
!     (cdb)->list = (l);
!
!
! /*
!  * StrategyBufferLookup
!  *
!  *    Lookup a page request in the cache directory. A buffer is only
!  *    returned for a T1 or T2 cache hit. B1 and B2 hits are only
!  *    remembered here to later affect the behaviour.
!  */
! BufferDesc *
! StrategyBufferLookup(BufferTag *tagPtr, bool recheck)
! {
!     BufferStrategyCDB    *cdb;
!
!     /*
!      * Lookup the block in the shared hash table
!      */
!     strategy_cdb_found = BufTableLookup(tagPtr);
!
!     /*
!      * Handle CDB lookup miss
!      */
!     if (strategy_cdb_found < 0)
!     {
!         if (!recheck)
!         {
!             /*
!              * This is an initial lookup and we have a complete
!              * cache miss (block found nowhere). This means we
!              * remember according to the current T1 size and the
!              * target T1 size from where we take a block if we
!              * need one later.
!              */
!             if (T1_LENGTH >= MAX(1, T1_TARGET))
!                 strategy_get_from = STRAT_LIST_T1;
!             else
!                 strategy_get_from = STRAT_LIST_T2;
!         }
!
!         /* report cache miss */
!         return NULL;
!     }
!
!     /*
!      * We found a CDB
!      */
!     cdb = &StrategyCDB[strategy_cdb_found];
!
!     /*
!      * If this is a T1 or T2 hit, we simply move the CDB to the
!      * T2 MRU position and return the found buffer.
!      */
!     if (cdb->list == STRAT_LIST_T1 || cdb->list == STRAT_LIST_T2)
!     {
!         STRAT_LIST_REMOVE(cdb);
!         STRAT_MRU_INSERT(cdb, STRAT_LIST_T2);
!
!         return &BufferDescriptors[cdb->buf_id];
!     }
!
!     /*
!      * In the case of a recheck we don't care about B1 or B2 hits here.
!      * The bufmgr does this call only to make sure noone faulted in the
!      * block while we where busy flushing another. Now for this really
!      * to end up as a B1 or B2 cache hit, we must have been flushing for
!      * quite some time as the block not only must have been read, but
!      * also traveled through the queue and evicted from the T cache again
!      * already.
!      */
!     if (recheck)
!         return NULL;
!
!     /*
!      * Adjust the target size of the T1 cache depending on if this is
!      * a B1 or B2 hit.
!      */
!     switch (cdb->list)
!     {
!         case STRAT_LIST_B1:
!             /*
!              * B1 hit means that the T1 cache is probably too
!              * small. Adjust the T1 target size and continue
!              * below.
!              */
!             T1_TARGET = MIN(T1_TARGET + MAX(B2_LENGTH / B1_LENGTH, 1),
!                             Data_Descriptors);
!             break;
!
!         case STRAT_LIST_B2:
!             /*
!              * B2 hit means that the T2 cache is probably too
!              * small. Adjust the T1 target size and continue
!              * below.
!              */
!             T1_TARGET = MAX(T1_TARGET - MAX(B1_LENGTH / B2_LENGTH, 1), 0);
!             break;
!
!         default:
!             elog(ERROR, "Buffer hash table corrupted - CDB on list %d found",
!                     cdb->list);
!     }
!
!     /*
!      * Decide where to take from if we will be out of
!      * free blocks later in StrategyGetBuffer().
!      */
!     if (T1_LENGTH >= MAX(1, T1_TARGET))
!         strategy_get_from = STRAT_LIST_T1;
!     else
!         strategy_get_from = STRAT_LIST_T2;
!
!     /*
!      * Even if we had seen the block in the past, it's data is
!      * not currently in memory ... cache miss to the bufmgr.
!      */
!     return NULL;
! }
!
!
! /*
!  * StrategyGetBuffer
!  *
!  *    Called by the bufmgr to get the next candidate buffer to use in
!  *    BufferAlloc(). The only hard requirement BufferAlloc() has is that
!  *    this buffer must not currently be pinned.
!  */
! BufferDesc *
! StrategyGetBuffer(void)
! {
!     int                cdb_id;
!     BufferDesc       *buf;
!
!     if (StrategyControl->listFreeBuffers < 0)
!     {
!         /* We don't have a free buffer, must take one from T1 or T2 */
!
!         if (strategy_get_from == STRAT_LIST_T1)
!         {
!             /*
!              * We should take the first unpinned buffer from T1.
!              */
!             cdb_id = StrategyControl->listHead[STRAT_LIST_T1];
!             while (cdb_id >= 0)
!             {
!                 buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
!                 if (buf->refcount == 0)
!                 {
!                     strategy_cdb_replace = cdb_id;
!                     Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1);
!                     return buf;
!                 }
!                 cdb_id = StrategyCDB[cdb_id].next;
!             }
!
!             /*
!              * No unpinned T1 buffer found - pardon T2 cache.
!              */
!             cdb_id = StrategyControl->listHead[STRAT_LIST_T2];
!             while (cdb_id >= 0)
!             {
!                 buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
!                 if (buf->refcount == 0)
!                 {
!                     strategy_cdb_replace = cdb_id;
!                     Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
!                     return buf;
!                 }
!                 cdb_id = StrategyCDB[cdb_id].next;
!             }
!
!             /*
!              * No unpinned buffers at all!!!
!              */
!             elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers");
!         }
!         else
!         {
!             /*
!              * We should take the first unpinned buffer from T2.
!              */
!             cdb_id = StrategyControl->listHead[STRAT_LIST_T2];
!             while (cdb_id >= 0)
!             {
!                 buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
!                 if (buf->refcount == 0)
!                 {
!                     strategy_cdb_replace = cdb_id;
!                     Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T2);
!                     return buf;
!                 }
!                 cdb_id = StrategyCDB[cdb_id].next;
!             }
!
!             /*
!              * No unpinned T2 buffer found - pardon T1 cache.
!              */
!             cdb_id = StrategyControl->listHead[STRAT_LIST_T1];
!             while (cdb_id >= 0)
!             {
!                 buf = &BufferDescriptors[StrategyCDB[cdb_id].buf_id];
!                 if (buf->refcount == 0)
!                 {
!                     strategy_cdb_replace = cdb_id;
!                     Assert(StrategyCDB[cdb_id].list == STRAT_LIST_T1);
!                     return buf;
!                 }
!                 cdb_id = StrategyCDB[cdb_id].next;
!             }
!
!             /*
!              * No unpinned buffers at all!!!
!              */
!             elog(ERROR, "StrategyGetBuffer(): Out of unpinned buffers");
!         }
!     }
!     else
!     {
!         /* There is a completely free buffer available - take it */
!
!         /*
!          * Note: This code uses the side effect that a free buffer
!          * can never be pinned or dirty and therefore the call to
!          * StrategyReplaceBuffer() will happen without the bufmgr
!          * releasing the bufmgr-lock in the meantime. That means,
!          * that there will never be any reason to recheck. Otherwise
!          * we would leak shared buffers here!
!          */
!         strategy_cdb_replace = -1;
!         buf = &BufferDescriptors[StrategyControl->listFreeBuffers];
!
!         StrategyControl->listFreeBuffers = buf->bufNext;
!         buf->bufNext = -1;

+         /* Buffer of freelist cannot be pinned */
+         Assert(buf->refcount == 0);
+
+         return buf;
+     }
+
+     /* not reached */
+     return NULL;
+ }
+
+
  /*
!  * StrategyReplaceBuffer
!  *
!  *    Called by the buffer manager to inform us that he possibly flushed
!  *     a buffer and is now about to replace the content. Prior to this call,
!  *    the cache algorithm still reports the buffer as in the cache. After
!  *    this call we report the new block, even if IO might still need to
!  *    start.
   */
+ void
+ StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum)
+ {
+     BufferStrategyCDB       *cdb_found;
+     BufferStrategyCDB       *cdb_replace;

!     if (strategy_cdb_found >= 0)
!     {
!         /* This was a ghost buffer cache hit (B1 or B2) */
!         cdb_found = &StrategyCDB[strategy_cdb_found];
!
!         /* Assert that the buffer remembered in cdb_found is the one */
!         /* the buffer manager is currently faulting in */
!         Assert(BUFFERTAG_EQUALS(&(cdb_found->buf_tag), rnode, blockNum));
!
!         if (strategy_cdb_replace >= 0)
!         {
!             /* We are satisfying it with an evicted T buffer */
!             cdb_replace = &StrategyCDB[strategy_cdb_replace];

!             /* Assert that the buffer remembered in cdb_replace is */
!             /* the one the buffer manager has just evicted */
!             Assert(cdb_replace->list == STRAT_LIST_T1 ||
!                     cdb_replace->list == STRAT_LIST_T2);
!             Assert(cdb_replace->buf_id == buf->buf_id);
!             Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));

+             /* Move the evicted T list entry to it's corresponding B list */
+             if (cdb_replace->list == STRAT_LIST_T1)
+             {
+                 STRAT_LIST_REMOVE(cdb_replace);
+                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
+             }
+             else
+             {
+                 STRAT_LIST_REMOVE(cdb_replace);
+                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
+             }
+             /* And clear it's block reference */
+             cdb_replace->buf_id = -1;
+         }
+         else
+         {
+             /* or we satisfy it with an unused buffer */
+         }

+         /* Now the found B CDB get's the buffer and is moved to T2 */
+         cdb_found->buf_id = buf->buf_id;
+         STRAT_LIST_REMOVE(cdb_found);
+         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
+     }
+     else
+     {
+         /* This was a complete cache miss, so we need to create */
+         /* a new CDB */
+         if (StrategyControl->listUnusedCDB >= 0)
+         {
+             /* We have an unused one, let's take that */
+             cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
+             StrategyControl->listUnusedCDB = cdb_found->next;
+         }
+         else
+         {
+             /* We want to evict a CDB from B2 for it. Only if B2 is */
+             /* empty we take it from B1. */
+             if (B2_LENGTH > 0)
+                 cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B2]];
+             else
+                 cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
+
+             /* Remove it's hashtable entry and take it from the list */
+             BufTableDelete(&(cdb_found->buf_tag));
+             STRAT_LIST_REMOVE(cdb_found);
+         }
+
+         /* Set the CDB's buf_tag and insert the hash key */
+         INIT_BUFFERTAG(&(cdb_found->buf_tag), rnode, blockNum);
+         BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
+
+         if (strategy_cdb_replace >= 0)
+         {
+             /* The buffer was formerly in a T list, move it's CDB
+              * to the corresponding B list */
+             cdb_replace = &StrategyCDB[strategy_cdb_replace];
+
+             Assert(cdb_replace->list == STRAT_LIST_T1 ||
+                     cdb_replace->list == STRAT_LIST_T2);
+             Assert(cdb_replace->buf_id == buf->buf_id);
+             Assert(BUFFERTAGS_EQUAL(&(cdb_replace->buf_tag), &(buf->tag)));
+
+             if (cdb_replace->list == STRAT_LIST_T1)
+             {
+                 STRAT_LIST_REMOVE(cdb_replace);
+                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
+             }
+             else
+             {
+                 STRAT_LIST_REMOVE(cdb_replace);
+                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
+             }
+             /* And clear it's block reference */
+             cdb_replace->buf_id = -1;
+         }
+         else
+         {
+             /* or we satisfy it with an unused buffer */
+         }
+
+         /* Assign the buffer id to the new CDB and add it to T1 */
+         cdb_found->buf_id = buf->buf_id;
+         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
+     }
+ }
+
+
  /*
!  * StrategyInvalidateBuffer
   *
!  *    Called by the buffer manager to inform us that a buffer content
!  *    is no longer valid. We simply throw away any eventual existing
!  *    buffer hash entry and move the CDB and buffer to the free lists.
   */
! void
! StrategyInvalidateBuffer(BufferDesc *buf)
  {
!     int                    cdb_id;
!     BufferStrategyCDB  *cdb;
!
!     cdb_id = BufTableLookup(&(buf->tag));

!     /* If we have the buffer somewhere in the directory, remove it
!      * and add the CDB to the list of unused CDB's. */
!     if (cdb_id >= 0)
!     {
!         cdb = &StrategyCDB[cdb_id];
!         BufTableDelete(&(cdb->buf_tag));
!         STRAT_LIST_REMOVE(cdb);
!         cdb->buf_id = -1;
!         cdb->next = StrategyControl->listUnusedCDB;
!         StrategyControl->listUnusedCDB = cdb_id;
!     }

!     /* Buffer is unreferenced now and should not contain any valid data
!      * so add it to the list of free buffers */
!     buf->bufNext = StrategyControl->listFreeBuffers;
!     StrategyControl->listFreeBuffers = buf->buf_id;
  }

+
+ /*
+  * StrategyInitialize -- initialize the buffer cache replacement
+  *        strategy.
+  *
+  * Assume: All of the buffers are already building a linked list.
+  *        Only called by postmaster and only during initialization.
+  */
+ void
+ StrategyInitialize(bool init)
+ {
+     bool found;
+     int i;
+
+     /*
+      * Initialize the shared CDB lookup hashtable
+      */
+     InitBufTable(Data_Descriptors * 2);
+
+     /*
+      * Get or create the shared strategy control block and the CDB's
+      */
+     StrategyControl = (BufferStrategyControl *)
+             ShmemInitStruct("Buffer Strategy Status",
+                     sizeof(BufferStrategyControl) +
+                     sizeof(BufferStrategyCDB) * (Data_Descriptors * 2 - 1),
+                     &found);
+     StrategyCDB = &(StrategyControl->cdb[0]);
+
+     if (!found)
+     {
+         /*
+          * Only done once, usually in postmaster
+          */
+         Assert(init);
+
+         /*
+          * Grab the whole linked list of free buffers for our
+          * strategy
+          */
+         StrategyControl->listFreeBuffers = 0;
+
+         /*
+          * We start off with a target T1 list size of
+          * half the available cache blocks.
+          */
+         StrategyControl->target_T1_size = Data_Descriptors / 2;
+
+         /*
+          * Initialize B1, T1, T2 and B2 lists to be empty
+          */
+         for (i = 0; i < STRAT_NUM_LISTS; i++)
+         {
+             StrategyControl->listHead[i] = -1;
+             StrategyControl->listTail[i] = -1;
+             StrategyControl->listSize[i] = 0;
+         }
+
+         /*
+          * All CDB's are linked as the listUnusedCDB
+          */
+         for (i = 0; i < Data_Descriptors * 2; i++)
+         {
+             StrategyCDB[i].next = i + 1;
+             StrategyCDB[i].list = STRAT_LIST_UNUSED;
+             CLEAR_BUFFERTAG(&(StrategyCDB[i].buf_tag));
+             StrategyCDB[i].buf_id = -1;
+         }
+         StrategyCDB[Data_Descriptors * 2 - 1].next = -1;
+         StrategyControl->listUnusedCDB = 0;
+     }
+     else
+     {
+         Assert(!init);
+     }
+ }
+
+
  #undef PinBuffer

  /*
***************
*** 95,112 ****

      if (buf->refcount == 0)
      {
-         IsInQueue(buf);
-
-         /* remove from freelist queue */
-         BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
-         BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
-         buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
-
          /* mark buffer as no longer free */
          buf->flags &= ~BM_FREE;
      }
-     else
-         IsNotInQueue(buf);

      if (PrivateRefCount[b] == 0)
          buf->refcount++;
--- 622,630 ----
***************
*** 144,150 ****
  {
      int            b = BufferDescriptorGetBuffer(buf) - 1;

-     IsNotInQueue(buf);
      Assert(buf->refcount > 0);
      Assert(PrivateRefCount[b] > 0);
      PrivateRefCount[b]--;
--- 662,667 ----
***************
*** 154,160 ****
      if (buf->refcount == 0)
      {
          /* buffer is now unpinned */
-         AddBufferToFreelist(buf);
          buf->flags |= BM_FREE;
      }
      else if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
--- 671,676 ----
***************
*** 186,249 ****
      }
  }
  #endif
-
- /*
-  * GetFreeBuffer() -- get the 'next' buffer from the freelist.
-  */
- BufferDesc *
- GetFreeBuffer(void)
- {
-     BufferDesc *buf;
-
-     if (Free_List_Descriptor == SharedFreeList->freeNext)
-     {
-         /* queue is empty. All buffers in the buffer pool are pinned. */
-         ereport(ERROR,
-                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
-                  errmsg("out of free buffers")));
-         return NULL;
-     }
-     buf = &(BufferDescriptors[SharedFreeList->freeNext]);
-
-     /* remove from freelist queue */
-     BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
-     BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
-     buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
-
-     buf->flags &= ~(BM_FREE);
-
-     return buf;
- }
-
- /*
-  * InitFreeList -- initialize the dummy buffer descriptor used
-  *        as a freelist head.
-  *
-  * Assume: All of the buffers are already linked in a circular
-  *        queue.     Only called by postmaster and only during
-  *        initialization.
-  */
- void
- InitFreeList(bool init)
- {
-     SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
-
-     if (init)
-     {
-         /* we only do this once, normally in the postmaster */
-         SharedFreeList->data = INVALID_OFFSET;
-         SharedFreeList->flags = 0;
-         SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
-         SharedFreeList->buf_id = Free_List_Descriptor;
-
-         /* insert it into a random spot in the circular queue */
-         SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
-         SharedFreeList->freePrev = 0;
-         BufferDescriptors[SharedFreeList->freeNext].freePrev =
-             BufferDescriptors[SharedFreeList->freePrev].freeNext =
-             Free_List_Descriptor;
-     }
- }


  /*
--- 702,707 ----
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/pgsql/CvsRoot/pgsql-server/src/include/storage/buf_internals.h,v
retrieving revision 1.61
diff -c -b -r1.61 buf_internals.h
*** src/include/storage/buf_internals.h    2003/08/04 02:40:14    1.61
--- src/include/storage/buf_internals.h    2003/11/02 21:58:52
***************
*** 72,88 ****
      (a)->rnode = (xx_reln)->rd_node \
  )

  /*
   *    BufferDesc -- shared buffer cache metadata for a single
   *                  shared buffer descriptor.
   */
  typedef struct sbufdesc
  {
!     Buffer        freeNext;        /* links for freelist chain */
!     Buffer        freePrev;
      SHMEM_OFFSET data;            /* pointer to data in buf pool */

!     /* tag and id must be together for table lookup (still true?) */
      BufferTag    tag;            /* file/block identifier */
      int            buf_id;            /* buffer's index number (from 0) */

--- 72,100 ----
      (a)->rnode = (xx_reln)->rd_node \
  )

+ #define BUFFERTAG_EQUALS(a,xx_reln,xx_blockNum) \
+ ( \
+     (a)->rnode.tblNode == (xx_reln)->rd_node.tblNode && \
+     (a)->rnode.relNode == (xx_reln)->rd_node.relNode && \
+     (a)->blockNum == (xx_blockNum) \
+ )
+ #define BUFFERTAGS_EQUAL(a,b) \
+ ( \
+     (a)->rnode.tblNode == (b)->rnode.tblNode && \
+     (a)->rnode.relNode == (b)->rnode.relNode && \
+     (a)->blockNum == (b)->blockNum \
+ )
+
  /*
   *    BufferDesc -- shared buffer cache metadata for a single
   *                  shared buffer descriptor.
   */
  typedef struct sbufdesc
  {
!     Buffer        bufNext;        /* link in freelist chain */
      SHMEM_OFFSET data;            /* pointer to data in buf pool */

!     /* tag and id must be together for table lookup */
      BufferTag    tag;            /* file/block identifier */
      int            buf_id;            /* buffer's index number (from 0) */

***************
*** 107,112 ****
--- 119,125 ----

  #define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

+
  /*
   * Each backend has its own BufferLocks[] array holding flag bits
   * showing what locks it has set on each buffer.
***************
*** 167,180 ****
  /*freelist.c*/
  extern void PinBuffer(BufferDesc *buf);
  extern void UnpinBuffer(BufferDesc *buf);
! extern BufferDesc *GetFreeBuffer(void);
! extern void InitFreeList(bool init);

  /* buf_table.c */
! extern void InitBufTable(void);
! extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
! extern bool BufTableDelete(BufferDesc *buf);
! extern bool BufTableInsert(BufferDesc *buf);

  /* bufmgr.c */
  extern BufferDesc *BufferDescriptors;
--- 180,196 ----
  /*freelist.c*/
  extern void PinBuffer(BufferDesc *buf);
  extern void UnpinBuffer(BufferDesc *buf);
! extern BufferDesc *StrategyBufferLookup(BufferTag *tagPtr, bool recheck);
! extern BufferDesc *StrategyGetBuffer(void);
! extern void StrategyReplaceBuffer(BufferDesc *buf, Relation rnode, BlockNumber blockNum);
! extern void StrategyInvalidateBuffer(BufferDesc *buf);
! extern void StrategyInitialize(bool init);

  /* buf_table.c */
! extern void InitBufTable(int size);
! extern int BufTableLookup(BufferTag *tagPtr);
! extern bool BufTableInsert(BufferTag *tagPtr, Buffer buf_id);
! extern bool BufTableDelete(BufferTag *tagPtr);

  /* bufmgr.c */
  extern BufferDesc *BufferDescriptors;

pgsql-hackers by date:

Previous
From: Neil Conway
Date:
Subject: Re: adding support for posix_fadvise()
Next
From: Manfred Spraul
Date:
Subject: Re: adding support for posix_fadvise()