Proof-of-concept ARC removal patches - Mailing list pgsql-patches

From Tom Lane
Subject Proof-of-concept ARC removal patches
Date
Msg-id 9253.1107484912@sss.pgh.pa.us
Whole thread Raw
Responses Re: Proof-of-concept ARC removal patches
List pgsql-patches
Attached are two different variants of a patch to remove the ARC
cache algorithm in favor of variants of the "2Q" algorithm.

The first patch approximates the "full 2Q" algorithm described by
Johnson and Shasha, while the second approximates their "simplified 2Q"
method.  Full 2Q uses a list of pages that were recently in cache but
no longer are, while simplified 2Q does not; the first patch is
therefore a smaller change to the existing code.

To show how closely the code is related, I kept the ARC names of T1, T2,
and B1 for the corresponding 2Q lists (A1, Am, A1out respectively).
We'd probably rename to use the 2Q names if we apply this, but the
patches are smaller and easier to review as they are.

The patches are not exactly the J&S algorithms, in part because I kept
the special cases for VACUUM, and in part because I really don't believe
their idea about not promoting from T1 into T2 until the page has fallen
into B1.  This helped to minimize the change from the existing ARC code.

In some desultory testing with pgbench, there was no significant
performance difference among the three algorithms, which leads me to
think that simplified 2Q might be the best bet (it certainly has the
smallest memory footprint).  But it would be a good idea to do some
more measurements before believing that.  I hope that Mark Wong can give
these a try on his setup soon.

These should apply against CVS tip of either HEAD or REL8_0_STABLE
branch; note that "CVS tip" includes the bufmgr refactoring patch
I applied earlier today.

Comments?

            regards, tom lane

*** src/backend/storage/buffer/freelist.c.orig    Thu Feb  3 18:29:11 2005
--- src/backend/storage/buffer/freelist.c    Thu Feb  3 20:54:08 2005
***************
*** 34,47 ****
   * Definitions for the buffer replacement strategy
   */
  #define STRAT_LIST_UNUSED    (-1)
! #define STRAT_LIST_B1        0
! #define STRAT_LIST_T1        1
! #define STRAT_LIST_T2        2
! #define STRAT_LIST_B2        3
! #define STRAT_NUM_LISTS        4

  /*
!  * The Cache Directory Block (CDB) of the Adaptive Replacement Cache (ARC)
   */
  typedef struct
  {
--- 34,49 ----
   * Definitions for the buffer replacement strategy
   */
  #define STRAT_LIST_UNUSED    (-1)
! #define STRAT_LIST_B1        0    /* A1out */
! #define STRAT_LIST_T1        1    /* A1 */
! #define STRAT_LIST_T2        2    /* Am */
! #define STRAT_NUM_LISTS        3

  /*
!  * We have a cache directory block (CDB) for every file page currently being
!  * tracked by the strategy module.  There can be more CDBs than there are
!  * actual shared buffers, allowing pages no longer in cache to still be
!  * tracked.
   */
  typedef struct
  {
***************
*** 55,68 ****
  } BufferStrategyCDB;

  /*
!  * The shared ARC control information.
   */
  typedef struct
  {
      int            target_T1_size; /* What T1 size are we aiming for */
      int            listUnusedCDB;    /* All unused StrategyCDB */
!     int            listHead[STRAT_NUM_LISTS];        /* ARC lists B1, T1, T2
!                                                  * and B2 */
      int            listTail[STRAT_NUM_LISTS];
      int            listSize[STRAT_NUM_LISTS];
      Buffer        listFreeBuffers;    /* List of unused buffers */
--- 57,69 ----
  } BufferStrategyCDB;

  /*
!  * The shared strategy control information.
   */
  typedef struct
  {
      int            target_T1_size; /* What T1 size are we aiming for */
      int            listUnusedCDB;    /* All unused StrategyCDB */
!     int            listHead[STRAT_NUM_LISTS];        /* CDB lists */
      int            listTail[STRAT_NUM_LISTS];
      int            listSize[STRAT_NUM_LISTS];
      Buffer        listFreeBuffers;    /* List of unused buffers */
***************
*** 91,97 ****
  #define B1_LENGTH    (StrategyControl->listSize[STRAT_LIST_B1])
  #define T1_LENGTH    (StrategyControl->listSize[STRAT_LIST_T1])
  #define T2_LENGTH    (StrategyControl->listSize[STRAT_LIST_T2])
- #define B2_LENGTH    (StrategyControl->listSize[STRAT_LIST_B2])


  /*
--- 92,97 ----
***************
*** 178,185 ****
          long        all_hit,
                      b1_hit,
                      t1_hit,
!                     t2_hit,
!                     b2_hit;
          int            id,
                      t1_clean,
                      t2_clean;
--- 178,184 ----
          long        all_hit,
                      b1_hit,
                      t1_hit,
!                     t2_hit;
          int            id,
                      t1_clean,
                      t2_clean;
***************
*** 205,211 ****
          }

          if (StrategyControl->num_lookup == 0)
!             all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0;
          else
          {
              b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
--- 204,210 ----
          }

          if (StrategyControl->num_lookup == 0)
!             all_hit = b1_hit = t1_hit = t2_hit = 0;
          else
          {
              b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
***************
*** 214,231 ****
                        StrategyControl->num_lookup);
              t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
                        StrategyControl->num_lookup);
!             b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 /
!                       StrategyControl->num_lookup);
!             all_hit = b1_hit + t1_hit + t2_hit + b2_hit;
          }

          errcxtold = error_context_stack;
          error_context_stack = NULL;
!         elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d",
!              T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH);
!         elog(DEBUG1, "ARC total   =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%",
!              all_hit, b1_hit, t1_hit, t2_hit, b2_hit);
!         elog(DEBUG1, "ARC clean buffers at LRU       T1=   %5d T2=   %5d",
               t1_clean, t2_clean);
          error_context_stack = errcxtold;

--- 213,228 ----
                        StrategyControl->num_lookup);
              t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
                        StrategyControl->num_lookup);
!             all_hit = b1_hit + t1_hit + t2_hit;
          }

          errcxtold = error_context_stack;
          error_context_stack = NULL;
!         elog(DEBUG1, "2Q T1target=%5d B1len=%5d T1len=%5d T2len=%5d",
!              T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH);
!         elog(DEBUG1, "2Q total   =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%%",
!              all_hit, b1_hit, t1_hit, t2_hit);
!         elog(DEBUG1, "2Q clean buffers at LRU       T1=   %5d T2=   %5d",
               t1_clean, t2_clean);
          error_context_stack = errcxtold;

***************
*** 233,239 ****
          StrategyControl->num_hit[STRAT_LIST_B1] = 0;
          StrategyControl->num_hit[STRAT_LIST_T1] = 0;
          StrategyControl->num_hit[STRAT_LIST_T2] = 0;
-         StrategyControl->num_hit[STRAT_LIST_B2] = 0;
          StrategyControl->stat_report = now;
      }
  }
--- 230,235 ----
***************
*** 242,248 ****
   * StrategyBufferLookup
   *
   *    Lookup a page request in the cache directory. A buffer is only
!  *    returned for a T1 or T2 cache hit. B1 and B2 hits are just
   *    remembered here, to possibly affect the behaviour later.
   *
   *    recheck indicates we are rechecking after I/O wait; do not change
--- 238,244 ----
   * StrategyBufferLookup
   *
   *    Lookup a page request in the cache directory. A buffer is only
!  *    returned for a T1 or T2 cache hit. B1 hits are just
   *    remembered here, to possibly affect the behaviour later.
   *
   *    recheck indicates we are rechecking after I/O wait; do not change
***************
*** 346,400 ****
      }

      /*
-      * In the case of a recheck we don't care about B1 or B2 hits here.
-      * The bufmgr does this call only to make sure no-one faulted in the
-      * block while we where busy flushing another; we don't want to doubly
-      * adjust the T1target.
-      *
-      * Now for this really to end up as a B1 or B2 cache hit, we must have
-      * been flushing for quite some time as the block not only must have
-      * been read, but also traveled through the queue and evicted from the
-      * T cache again already.
-      *
-      * VACUUM re-reads shouldn't adjust the target either.
-      */
-     if (recheck || strategy_hint_vacuum)
-         return NULL;
-
-     /*
-      * Adjust the target size of the T1 cache depending on if this is a B1
-      * or B2 hit.
-      */
-     switch (cdb->list)
-     {
-         case STRAT_LIST_B1:
-
-             /*
-              * B1 hit means that the T1 cache is probably too small.
-              * Adjust the T1 target size and continue below.
-              */
-             T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1),
-                             NBuffers);
-             break;
-
-         case STRAT_LIST_B2:
-
-             /*
-              * B2 hit means that the T2 cache is probably too small.
-              * Adjust the T1 target size and continue below.
-              */
-             T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0);
-             break;
-
-         default:
-             elog(ERROR, "buffer hash table corrupted: CDB->list = %d",
-                  cdb->list);
-     }
-
-     /*
       * Even though we had seen the block in the past, its data is not
       * currently in memory ... cache miss to the bufmgr.
       */
      return NULL;
  }

--- 342,351 ----
      }

      /*
       * Even though we had seen the block in the past, its data is not
       * currently in memory ... cache miss to the bufmgr.
       */
+     Assert(cdb->list == STRAT_LIST_B1);
      return NULL;
  }

***************
*** 423,429 ****
           * We don't have a free buffer, must take one from T1 or T2.
           * Choose based on trying to converge T1len to T1target.
           */
!         if (T1_LENGTH >= Max(1, T1_TARGET))
          {
              /*
               * We should take the first unpinned buffer from T1.
--- 374,380 ----
           * We don't have a free buffer, must take one from T1 or T2.
           * Choose based on trying to converge T1len to T1target.
           */
!         if (T1_LENGTH >= T1_TARGET)
          {
              /*
               * We should take the first unpinned buffer from T1.
***************
*** 553,559 ****

      if (cdb_found_index >= 0)
      {
!         /* This must have been a ghost buffer cache hit (B1 or B2) */
          cdb_found = &StrategyCDB[cdb_found_index];

          /* Assert that the buffer remembered in cdb_found is the one */
--- 504,510 ----

      if (cdb_found_index >= 0)
      {
!         /* This must have been a ghost buffer cache hit (B1 list) */
          cdb_found = &StrategyCDB[cdb_found_index];

          /* Assert that the buffer remembered in cdb_found is the one */
***************
*** 573,586 ****
              Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

              /*
!              * Under normal circumstances we move the evicted T list entry
!              * to the corresponding B list.  However, T1 entries that
!              * exist only because of VACUUM are just thrown into the
!              * unused list instead. We don't expect them to be touched
!              * again by the VACUUM, and if we put them into B1 then VACUUM
!              * would skew T1_target adjusting.
               */
!             if (cdb_replace->t1_vacuum)
              {
                  BufTableDelete(&(cdb_replace->buf_tag));
                  STRAT_LIST_REMOVE(cdb_replace);
--- 524,537 ----
              Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

              /*
!              * Under normal circumstances we move evicted T1 list entries
!              * to the B1 list.  However, T1 entries that exist only because
!              * of VACUUM are just thrown into the unused list instead,
!              * since it's unlikely they'll be touched again soon.  Similarly,
!              * evicted T2 entries are thrown away; the LRU T2 entry cannot
!              * have been touched recently.
               */
!             if (cdb_replace->t1_vacuum || cdb_replace->list == STRAT_LIST_T2)
              {
                  BufTableDelete(&(cdb_replace->buf_tag));
                  STRAT_LIST_REMOVE(cdb_replace);
***************
*** 589,604 ****
              }
              else
              {
!                 if (cdb_replace->list == STRAT_LIST_T1)
!                 {
!                     STRAT_LIST_REMOVE(cdb_replace);
!                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
!                 }
!                 else
!                 {
!                     STRAT_LIST_REMOVE(cdb_replace);
!                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
!                 }
              }
              /* And clear its block reference */
              cdb_replace->buf_id = -1;
--- 540,547 ----
              }
              else
              {
!                 STRAT_LIST_REMOVE(cdb_replace);
!                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
              }
              /* And clear its block reference */
              cdb_replace->buf_id = -1;
***************
*** 608,614 ****
              /* We are satisfying it with an unused buffer */
          }

!         /* Now the found B CDB gets the buffer and is moved to T2 */
          cdb_found->buf_id = buf->buf_id;
          STRAT_LIST_REMOVE(cdb_found);
          STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
--- 551,557 ----
              /* We are satisfying it with an unused buffer */
          }

!         /* Now the found B1 CDB gets the buffer and is moved to T2 */
          cdb_found->buf_id = buf->buf_id;
          STRAT_LIST_REMOVE(cdb_found);
          STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
***************
*** 617,652 ****
      {
          /*
           * This was a complete cache miss, so we need to create a new CDB.
!          * The goal is to keep T1len+B1len <= c.
           */
!         if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
          {
!             /* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
!             cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
!
!             BufTableDelete(&(cdb_found->buf_tag));
!             STRAT_LIST_REMOVE(cdb_found);
          }
          else
          {
!             /* Otherwise, we try to use a free one */
!             if (StrategyControl->listUnusedCDB >= 0)
!             {
!                 cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
!                 StrategyControl->listUnusedCDB = cdb_found->next;
!             }
!             else
!             {
!                 /* If there isn't, we take B2-LRU ... except if */
!                 /* T1len+B1len+T2len = c ... oh my */
!                 if (B2_LENGTH > 0)
!                     cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B2]];
!                 else
!                     cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];

!                 BufTableDelete(&(cdb_found->buf_tag));
!                 STRAT_LIST_REMOVE(cdb_found);
!             }
          }

          /* Set the CDB's buf_tag and insert it into the hash table */
--- 560,581 ----
      {
          /*
           * This was a complete cache miss, so we need to create a new CDB.
!          * We use a free one if available, else reclaim the tail end of B1.
           */
!         if (StrategyControl->listUnusedCDB >= 0)
          {
!             cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
!             StrategyControl->listUnusedCDB = cdb_found->next;
          }
          else
          {
!             /* Can't fail because we have more CDBs than buffers... */
!             if (B1_LENGTH == 0)
!                 elog(PANIC, "StrategyReplaceBuffer: out of CDBs");
!             cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];

!             BufTableDelete(&(cdb_found->buf_tag));
!             STRAT_LIST_REMOVE(cdb_found);
          }

          /* Set the CDB's buf_tag and insert it into the hash table */
***************
*** 657,663 ****
          {
              /*
               * The buffer was formerly in a T list, move its CDB to the
!              * corresponding B list
               */
              cdb_replace = &StrategyCDB[cdb_replace_index];

--- 586,592 ----
          {
              /*
               * The buffer was formerly in a T list, move its CDB to the
!              * appropriate list: B1 if T1, else discard it, as above
               */
              cdb_replace = &StrategyCDB[cdb_replace_index];

***************
*** 673,680 ****
              }
              else
              {
                  STRAT_LIST_REMOVE(cdb_replace);
!                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
              }
              /* And clear its block reference */
              cdb_replace->buf_id = -1;
--- 602,611 ----
              }
              else
              {
+                 BufTableDelete(&(cdb_replace->buf_tag));
                  STRAT_LIST_REMOVE(cdb_replace);
!                 cdb_replace->next = StrategyControl->listUnusedCDB;
!                 StrategyControl->listUnusedCDB = cdb_replace_index;
              }
              /* And clear its block reference */
              cdb_replace->buf_id = -1;
***************
*** 744,750 ****
      cdb = &StrategyCDB[cdb_id];

      /*
!      * Remove the CDB from the hashtable and the ARC queue it is currently
       * on.
       */
      BufTableDelete(&(cdb->buf_tag));
--- 675,681 ----
      cdb = &StrategyCDB[cdb_id];

      /*
!      * Remove the CDB from the hashtable and the queue it is currently
       * on.
       */
      BufTableDelete(&(cdb->buf_tag));
***************
*** 808,814 ****

      /*
       * Traverse the T1 and T2 list LRU to MRU in "parallel" and add all
!      * dirty buffers found in that order to the list. The ARC strategy
       * keeps all used buffers including pinned ones in the T1 or T2 list.
       * So we cannot miss any dirty buffers.
       */
--- 739,745 ----

      /*
       * Traverse the T1 and T2 list LRU to MRU in "parallel" and add all
!      * dirty buffers found in that order to the list. The 2Q strategy
       * keeps all used buffers including pinned ones in the T1 or T2 list.
       * So we cannot miss any dirty buffers.
       */
***************
*** 870,885 ****
  int
  StrategyShmemSize(void)
  {
      int            size = 0;

      /* size of CDB lookup hash table */
!     size += BufTableShmemSize(NBuffers * 2);

      /* size of the shared replacement strategy control block */
      size += MAXALIGN(sizeof(BufferStrategyControl));

!     /* size of the ARC directory blocks */
!     size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB));

      return size;
  }
--- 801,818 ----
  int
  StrategyShmemSize(void)
  {
+     /* A1out list can hold 50% of NBuffers, per Johnson and Shasha */
+     int            nCDBs = NBuffers + NBuffers / 2;
      int            size = 0;

      /* size of CDB lookup hash table */
!     size += BufTableShmemSize(nCDBs);

      /* size of the shared replacement strategy control block */
      size += MAXALIGN(sizeof(BufferStrategyControl));

!     /* size of the CDB directory */
!     size += MAXALIGN(nCDBs * sizeof(BufferStrategyCDB));

      return size;
  }
***************
*** 894,906 ****
  void
  StrategyInitialize(bool init)
  {
      bool        found;
      int            i;

      /*
       * Initialize the shared CDB lookup hashtable
       */
!     InitBufTable(NBuffers * 2);

      /*
       * Get or create the shared strategy control block and the CDB's
--- 827,841 ----
  void
  StrategyInitialize(bool init)
  {
+     /* A1out list can hold 50% of NBuffers, per Johnson and Shasha */
+     int            nCDBs = NBuffers + NBuffers / 2;
      bool        found;
      int            i;

      /*
       * Initialize the shared CDB lookup hashtable
       */
!     InitBufTable(nCDBs);

      /*
       * Get or create the shared strategy control block and the CDB's
***************
*** 908,914 ****
      StrategyControl = (BufferStrategyControl *)
          ShmemInitStruct("Buffer Strategy Status",
                          sizeof(BufferStrategyControl) +
!                         sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
                          &found);
      StrategyCDB = &(StrategyControl->cdb[0]);

--- 843,849 ----
      StrategyControl = (BufferStrategyControl *)
          ShmemInitStruct("Buffer Strategy Status",
                          sizeof(BufferStrategyControl) +
!                         sizeof(BufferStrategyCDB) * (nCDBs - 1),
                          &found);
      StrategyCDB = &(StrategyControl->cdb[0]);

***************
*** 926,938 ****
          StrategyControl->listFreeBuffers = 0;

          /*
!          * We start off with a target T1 list size of half the available
!          * cache blocks.
           */
!         StrategyControl->target_T1_size = NBuffers / 2;

          /*
!          * Initialize B1, T1, T2 and B2 lists to be empty
           */
          for (i = 0; i < STRAT_NUM_LISTS; i++)
          {
--- 861,873 ----
          StrategyControl->listFreeBuffers = 0;

          /*
!          * We set the target T1 size to 1/4th of available buffers.
!          * Possibly this should be a runtime tunable.
           */
!         StrategyControl->target_T1_size = NBuffers / 4;

          /*
!          * Initialize all lists to be empty
           */
          for (i = 0; i < STRAT_NUM_LISTS; i++)
          {
***************
*** 947,960 ****
          /*
           * All CDB's are linked as the listUnusedCDB
           */
!         for (i = 0; i < NBuffers * 2; i++)
          {
              StrategyCDB[i].next = i + 1;
              StrategyCDB[i].list = STRAT_LIST_UNUSED;
              CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
              StrategyCDB[i].buf_id = -1;
          }
!         StrategyCDB[NBuffers * 2 - 1].next = -1;
          StrategyControl->listUnusedCDB = 0;
      }
      else
--- 882,895 ----
          /*
           * All CDB's are linked as the listUnusedCDB
           */
!         for (i = 0; i < nCDBs; i++)
          {
              StrategyCDB[i].next = i + 1;
              StrategyCDB[i].list = STRAT_LIST_UNUSED;
              CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
              StrategyCDB[i].buf_id = -1;
          }
!         StrategyCDB[nCDBs - 1].next = -1;
          StrategyControl->listUnusedCDB = 0;
      }
      else
*** src/backend/storage/buffer/freelist.c.orig    Thu Feb  3 18:29:11 2005
--- src/backend/storage/buffer/freelist.c    Thu Feb  3 19:21:01 2005
***************
*** 34,47 ****
   * Definitions for the buffer replacement strategy
   */
  #define STRAT_LIST_UNUSED    (-1)
! #define STRAT_LIST_B1        0
! #define STRAT_LIST_T1        1
! #define STRAT_LIST_T2        2
! #define STRAT_LIST_B2        3
! #define STRAT_NUM_LISTS        4

  /*
!  * The Cache Directory Block (CDB) of the Adaptive Replacement Cache (ARC)
   */
  typedef struct
  {
--- 34,48 ----
   * Definitions for the buffer replacement strategy
   */
  #define STRAT_LIST_UNUSED    (-1)
! #define STRAT_LIST_T1        0
! #define STRAT_LIST_T2        1
! #define STRAT_NUM_LISTS        2

  /*
!  * We have a cache directory block (CDB) for every file page currently being
!  * tracked by the strategy module.  In principle there can be more CDBs than
!  * there are actual shared buffers, allowing pages no longer in cache to
!  * still be tracked.  The current implementation doesn't do that though.
   */
  typedef struct
  {
***************
*** 55,68 ****
  } BufferStrategyCDB;

  /*
!  * The shared ARC control information.
   */
  typedef struct
  {
      int            target_T1_size; /* What T1 size are we aiming for */
      int            listUnusedCDB;    /* All unused StrategyCDB */
!     int            listHead[STRAT_NUM_LISTS];        /* ARC lists B1, T1, T2
!                                                  * and B2 */
      int            listTail[STRAT_NUM_LISTS];
      int            listSize[STRAT_NUM_LISTS];
      Buffer        listFreeBuffers;    /* List of unused buffers */
--- 56,68 ----
  } BufferStrategyCDB;

  /*
!  * The shared strategy control information.
   */
  typedef struct
  {
      int            target_T1_size; /* What T1 size are we aiming for */
      int            listUnusedCDB;    /* All unused StrategyCDB */
!     int            listHead[STRAT_NUM_LISTS];        /* CDB lists T1 and T2 */
      int            listTail[STRAT_NUM_LISTS];
      int            listSize[STRAT_NUM_LISTS];
      Buffer        listFreeBuffers;    /* List of unused buffers */
***************
*** 88,97 ****


  #define T1_TARGET    (StrategyControl->target_T1_size)
- #define B1_LENGTH    (StrategyControl->listSize[STRAT_LIST_B1])
  #define T1_LENGTH    (StrategyControl->listSize[STRAT_LIST_T1])
  #define T2_LENGTH    (StrategyControl->listSize[STRAT_LIST_T2])
- #define B2_LENGTH    (StrategyControl->listSize[STRAT_LIST_B2])


  /*
--- 88,95 ----
***************
*** 176,185 ****
      if (StrategyControl->stat_report + DebugSharedBuffers < now)
      {
          long        all_hit,
-                     b1_hit,
                      t1_hit,
!                     t2_hit,
!                     b2_hit;
          int            id,
                      t1_clean,
                      t2_clean;
--- 174,181 ----
      if (StrategyControl->stat_report + DebugSharedBuffers < now)
      {
          long        all_hit,
                      t1_hit,
!                     t2_hit;
          int            id,
                      t1_clean,
                      t2_clean;
***************
*** 205,239 ****
          }

          if (StrategyControl->num_lookup == 0)
!             all_hit = b1_hit = t1_hit = t2_hit = b2_hit = 0;
          else
          {
-             b1_hit = (StrategyControl->num_hit[STRAT_LIST_B1] * 100 /
-                       StrategyControl->num_lookup);
              t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 /
                        StrategyControl->num_lookup);
              t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
                        StrategyControl->num_lookup);
!             b2_hit = (StrategyControl->num_hit[STRAT_LIST_B2] * 100 /
!                       StrategyControl->num_lookup);
!             all_hit = b1_hit + t1_hit + t2_hit + b2_hit;
          }

          errcxtold = error_context_stack;
          error_context_stack = NULL;
!         elog(DEBUG1, "ARC T1target=%5d B1len=%5d T1len=%5d T2len=%5d B2len=%5d",
!              T1_TARGET, B1_LENGTH, T1_LENGTH, T2_LENGTH, B2_LENGTH);
!         elog(DEBUG1, "ARC total   =%4ld%% B1hit=%4ld%% T1hit=%4ld%% T2hit=%4ld%% B2hit=%4ld%%",
!              all_hit, b1_hit, t1_hit, t2_hit, b2_hit);
!         elog(DEBUG1, "ARC clean buffers at LRU       T1=   %5d T2=   %5d",
               t1_clean, t2_clean);
          error_context_stack = errcxtold;

          StrategyControl->num_lookup = 0;
-         StrategyControl->num_hit[STRAT_LIST_B1] = 0;
          StrategyControl->num_hit[STRAT_LIST_T1] = 0;
          StrategyControl->num_hit[STRAT_LIST_T2] = 0;
-         StrategyControl->num_hit[STRAT_LIST_B2] = 0;
          StrategyControl->stat_report = now;
      }
  }
--- 201,229 ----
          }

          if (StrategyControl->num_lookup == 0)
!             all_hit = t1_hit = t2_hit = 0;
          else
          {
              t1_hit = (StrategyControl->num_hit[STRAT_LIST_T1] * 100 /
                        StrategyControl->num_lookup);
              t2_hit = (StrategyControl->num_hit[STRAT_LIST_T2] * 100 /
                        StrategyControl->num_lookup);
!             all_hit = t1_hit + t2_hit;
          }

          errcxtold = error_context_stack;
          error_context_stack = NULL;
!         elog(DEBUG1, "2Q T1target=%5d T1len=%5d T2len=%5d",
!              T1_TARGET, T1_LENGTH, T2_LENGTH);
!         elog(DEBUG1, "2Q total   =%4ld%% T1hit=%4ld%% T2hit=%4ld%%",
!              all_hit, t1_hit, t2_hit);
!         elog(DEBUG1, "2Q clean buffers at LRU       T1=   %5d T2=   %5d",
               t1_clean, t2_clean);
          error_context_stack = errcxtold;

          StrategyControl->num_lookup = 0;
          StrategyControl->num_hit[STRAT_LIST_T1] = 0;
          StrategyControl->num_hit[STRAT_LIST_T2] = 0;
          StrategyControl->stat_report = now;
      }
  }
***************
*** 241,249 ****
  /*
   * StrategyBufferLookup
   *
!  *    Lookup a page request in the cache directory. A buffer is only
!  *    returned for a T1 or T2 cache hit. B1 and B2 hits are just
!  *    remembered here, to possibly affect the behaviour later.
   *
   *    recheck indicates we are rechecking after I/O wait; do not change
   *    internal status in this case.
--- 231,237 ----
  /*
   * StrategyBufferLookup
   *
!  *    Lookup a page request in the cache directory.
   *
   *    recheck indicates we are rechecking after I/O wait; do not change
   *    internal status in this case.
***************
*** 346,400 ****
      }

      /*
!      * In the case of a recheck we don't care about B1 or B2 hits here.
!      * The bufmgr does this call only to make sure no-one faulted in the
!      * block while we where busy flushing another; we don't want to doubly
!      * adjust the T1target.
!      *
!      * Now for this really to end up as a B1 or B2 cache hit, we must have
!      * been flushing for quite some time as the block not only must have
!      * been read, but also traveled through the queue and evicted from the
!      * T cache again already.
!      *
!      * VACUUM re-reads shouldn't adjust the target either.
!      */
!     if (recheck || strategy_hint_vacuum)
!         return NULL;
!
!     /*
!      * Adjust the target size of the T1 cache depending on if this is a B1
!      * or B2 hit.
!      */
!     switch (cdb->list)
!     {
!         case STRAT_LIST_B1:
!
!             /*
!              * B1 hit means that the T1 cache is probably too small.
!              * Adjust the T1 target size and continue below.
!              */
!             T1_TARGET = Min(T1_TARGET + Max(B2_LENGTH / B1_LENGTH, 1),
!                             NBuffers);
!             break;
!
!         case STRAT_LIST_B2:
!
!             /*
!              * B2 hit means that the T2 cache is probably too small.
!              * Adjust the T1 target size and continue below.
!              */
!             T1_TARGET = Max(T1_TARGET - Max(B1_LENGTH / B2_LENGTH, 1), 0);
!             break;
!
!         default:
!             elog(ERROR, "buffer hash table corrupted: CDB->list = %d",
!                  cdb->list);
!     }
!
!     /*
!      * Even though we had seen the block in the past, its data is not
!      * currently in memory ... cache miss to the bufmgr.
       */
      return NULL;
  }

--- 334,343 ----
      }

      /*
!      * Shouldn't get here...
       */
+     elog(ERROR, "buffer hash table corrupted: CDB->list = %d",
+          cdb->list);
      return NULL;
  }

***************
*** 423,429 ****
           * We don't have a free buffer, must take one from T1 or T2.
           * Choose based on trying to converge T1len to T1target.
           */
!         if (T1_LENGTH >= Max(1, T1_TARGET))
          {
              /*
               * We should take the first unpinned buffer from T1.
--- 366,372 ----
           * We don't have a free buffer, must take one from T1 or T2.
           * Choose based on trying to converge T1len to T1target.
           */
!         if (T1_LENGTH >= T1_TARGET)
          {
              /*
               * We should take the first unpinned buffer from T1.
***************
*** 548,720 ****
  StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
                        int cdb_found_index, int cdb_replace_index)
  {
!     BufferStrategyCDB *cdb_found;
!     BufferStrategyCDB *cdb_replace;

!     if (cdb_found_index >= 0)
      {
!         /* This must have been a ghost buffer cache hit (B1 or B2) */
!         cdb_found = &StrategyCDB[cdb_found_index];

!         /* Assert that the buffer remembered in cdb_found is the one */
!         /* the buffer manager is currently faulting in */
!         Assert(BUFFERTAGS_EQUAL(cdb_found->buf_tag, *newTag));
!
!         if (cdb_replace_index >= 0)
!         {
!             /* We are satisfying it with an evicted T buffer */
!             cdb_replace = &StrategyCDB[cdb_replace_index];
!
!             /* Assert that the buffer remembered in cdb_replace is */
!             /* the one the buffer manager has just evicted */
!             Assert(cdb_replace->list == STRAT_LIST_T1 ||
!                    cdb_replace->list == STRAT_LIST_T2);
!             Assert(cdb_replace->buf_id == buf->buf_id);
!             Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

!             /*
!              * Under normal circumstances we move the evicted T list entry
!              * to the corresponding B list.  However, T1 entries that
!              * exist only because of VACUUM are just thrown into the
!              * unused list instead. We don't expect them to be touched
!              * again by the VACUUM, and if we put them into B1 then VACUUM
!              * would skew T1_target adjusting.
!              */
!             if (cdb_replace->t1_vacuum)
!             {
!                 BufTableDelete(&(cdb_replace->buf_tag));
!                 STRAT_LIST_REMOVE(cdb_replace);
!                 cdb_replace->next = StrategyControl->listUnusedCDB;
!                 StrategyControl->listUnusedCDB = cdb_replace_index;
!             }
!             else
!             {
!                 if (cdb_replace->list == STRAT_LIST_T1)
!                 {
!                     STRAT_LIST_REMOVE(cdb_replace);
!                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
!                 }
!                 else
!                 {
!                     STRAT_LIST_REMOVE(cdb_replace);
!                     STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
!                 }
!             }
!             /* And clear its block reference */
!             cdb_replace->buf_id = -1;
!         }
!         else
!         {
!             /* We are satisfying it with an unused buffer */
!         }
!
!         /* Now the found B CDB gets the buffer and is moved to T2 */
!         cdb_found->buf_id = buf->buf_id;
!         STRAT_LIST_REMOVE(cdb_found);
!         STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
      }
      else
      {
!         /*
!          * This was a complete cache miss, so we need to create a new CDB.
!          * The goal is to keep T1len+B1len <= c.
!          */
!         if (B1_LENGTH > 0 && (T1_LENGTH + B1_LENGTH) >= NBuffers)
          {
!             /* So if B1 isn't empty and T1len+B1len >= c we take B1-LRU */
!             cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
!
!             BufTableDelete(&(cdb_found->buf_tag));
!             STRAT_LIST_REMOVE(cdb_found);
          }
          else
          {
!             /* Otherwise, we try to use a free one */
!             if (StrategyControl->listUnusedCDB >= 0)
!             {
!                 cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
!                 StrategyControl->listUnusedCDB = cdb_found->next;
!             }
!             else
!             {
!                 /* If there isn't, we take B2-LRU ... except if */
!                 /* T1len+B1len+T2len = c ... oh my */
!                 if (B2_LENGTH > 0)
!                     cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B2]];
!                 else
!                     cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];
!
!                 BufTableDelete(&(cdb_found->buf_tag));
!                 STRAT_LIST_REMOVE(cdb_found);
!             }
          }

!         /* Set the CDB's buf_tag and insert it into the hash table */
!         cdb_found->buf_tag = *newTag;
!         BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));
!
!         if (cdb_replace_index >= 0)
!         {
!             /*
!              * The buffer was formerly in a T list, move its CDB to the
!              * corresponding B list
!              */
!             cdb_replace = &StrategyCDB[cdb_replace_index];

!             Assert(cdb_replace->list == STRAT_LIST_T1 ||
!                    cdb_replace->list == STRAT_LIST_T2);
!             Assert(cdb_replace->buf_id == buf->buf_id);
!             Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

!             if (cdb_replace->list == STRAT_LIST_T1)
!             {
!                 STRAT_LIST_REMOVE(cdb_replace);
!                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
!             }
!             else
!             {
!                 STRAT_LIST_REMOVE(cdb_replace);
!                 STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B2);
!             }
!             /* And clear its block reference */
!             cdb_replace->buf_id = -1;
!         }
          else
          {
!             /* We are satisfying it with an unused buffer */
          }
-
-         /* Assign the buffer id to the new CDB */
-         cdb_found->buf_id = buf->buf_id;
-
-         /*
-          * Specialized VACUUM optimization. If this complete cache miss
-          * happened because vacuum needed the page, we place it at the LRU
-          * position of T1; normally it goes at the MRU position.
-          */
-         if (strategy_hint_vacuum)
-         {
-             if (TransactionIdEquals(strategy_vacuum_xid,
-                                     GetTopTransactionId()))
-                 STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
-             else
-             {
-                 /* VACUUM must have been aborted by error, reset flag */
-                 strategy_hint_vacuum = false;
-                 STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
-             }
-         }
-         else
-             STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
-
-         /*
-          * Remember the Xid when this buffer went onto T1 to avoid a
-          * single UPDATE promoting a newcomer straight into T2. Also
-          * remember if it was loaded for VACUUM.
-          */
-         cdb_found->t1_xid = GetTopTransactionId();
-         cdb_found->t1_vacuum = strategy_hint_vacuum;
      }
  }


--- 491,565 ----
  StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
                        int cdb_found_index, int cdb_replace_index)
  {
!     BufferStrategyCDB *new_cdb;
!
!     /* StrategyBufferLookup must have failed */
!     Assert(cdb_found_index < 0);

!     if (cdb_replace_index >= 0)
      {
!         /* Recycle the buffer's old CDB entry */
!         new_cdb = &StrategyCDB[cdb_replace_index];

!         Assert(new_cdb->list == STRAT_LIST_T1 ||
!                new_cdb->list == STRAT_LIST_T2);
!         Assert(new_cdb->buf_id == buf->buf_id);
!         Assert(BUFFERTAGS_EQUAL(new_cdb->buf_tag, buf->tag));

!         BufTableDelete(&(new_cdb->buf_tag));
!         STRAT_LIST_REMOVE(new_cdb);
      }
      else
      {
!         /* Unused buffer, so assign it an unused CDB */
!         if (StrategyControl->listUnusedCDB >= 0)
          {
!             /* Use a free one */
!             new_cdb = &StrategyCDB[StrategyControl->listUnusedCDB];
!             StrategyControl->listUnusedCDB = new_cdb->next;
          }
          else
          {
!             /* we have more buffers than CDB's ?? */
!             elog(PANIC, "StrategyReplaceBuffer: out of CDBs");
!             new_cdb = NULL;        /* keep compiler quiet */
          }
+     }

!     /* Set the CDB's buf_tag and insert it into the hash table */
!     new_cdb->buf_tag = *newTag;
!     BufTableInsert(&(new_cdb->buf_tag), (new_cdb - StrategyCDB));

!     /* Assign the buffer id to the new CDB */
!     new_cdb->buf_id = buf->buf_id;

!     /*
!      * Specialized VACUUM optimization. If this complete cache miss
!      * happened because vacuum needed the page, we place it at the LRU
!      * position of T1; normally it goes at the MRU position.
!      */
!     if (strategy_hint_vacuum)
!     {
!         if (TransactionIdEquals(strategy_vacuum_xid,
!                                 GetTopTransactionId()))
!             STRAT_LRU_INSERT(new_cdb, STRAT_LIST_T1);
          else
          {
!             /* VACUUM must have been aborted by error, reset flag */
!             strategy_hint_vacuum = false;
!             STRAT_MRU_INSERT(new_cdb, STRAT_LIST_T1);
          }
      }
+     else
+         STRAT_MRU_INSERT(new_cdb, STRAT_LIST_T1);
+
+     /*
+      * Remember the Xid when this buffer went onto T1 to avoid a
+      * single UPDATE promoting a newcomer straight into T2. Also
+      * remember if it was loaded for VACUUM.
+      */
+     new_cdb->t1_xid = GetTopTransactionId();
+     new_cdb->t1_vacuum = strategy_hint_vacuum;
  }


***************
*** 744,750 ****
      cdb = &StrategyCDB[cdb_id];

      /*
!      * Remove the CDB from the hashtable and the ARC queue it is currently
       * on.
       */
      BufTableDelete(&(cdb->buf_tag));
--- 589,595 ----
      cdb = &StrategyCDB[cdb_id];

      /*
!      * Remove the CDB from the hashtable and the 2Q queue it is currently
       * on.
       */
      BufTableDelete(&(cdb->buf_tag));
***************
*** 808,814 ****

      /*
       * Traverse the T1 and T2 list LRU to MRU in "parallel" and add all
!      * dirty buffers found in that order to the list. The ARC strategy
       * keeps all used buffers including pinned ones in the T1 or T2 list.
       * So we cannot miss any dirty buffers.
       */
--- 653,659 ----

      /*
       * Traverse the T1 and T2 list LRU to MRU in "parallel" and add all
!      * dirty buffers found in that order to the list. The 2Q strategy
       * keeps all used buffers including pinned ones in the T1 or T2 list.
       * So we cannot miss any dirty buffers.
       */
***************
*** 870,885 ****
  int
  StrategyShmemSize(void)
  {
      int            size = 0;

      /* size of CDB lookup hash table */
!     size += BufTableShmemSize(NBuffers * 2);

      /* size of the shared replacement strategy control block */
      size += MAXALIGN(sizeof(BufferStrategyControl));

!     /* size of the ARC directory blocks */
!     size += MAXALIGN(NBuffers * 2 * sizeof(BufferStrategyCDB));

      return size;
  }
--- 715,731 ----
  int
  StrategyShmemSize(void)
  {
+     int            nCDBs = NBuffers;
      int            size = 0;

      /* size of CDB lookup hash table */
!     size += BufTableShmemSize(nCDBs);

      /* size of the shared replacement strategy control block */
      size += MAXALIGN(sizeof(BufferStrategyControl));

!     /* size of the CDB directory */
!     size += MAXALIGN(nCDBs * sizeof(BufferStrategyCDB));

      return size;
  }
***************
*** 894,906 ****
  void
  StrategyInitialize(bool init)
  {
      bool        found;
      int            i;

      /*
       * Initialize the shared CDB lookup hashtable
       */
!     InitBufTable(NBuffers * 2);

      /*
       * Get or create the shared strategy control block and the CDB's
--- 740,753 ----
  void
  StrategyInitialize(bool init)
  {
+     int            nCDBs = NBuffers;
      bool        found;
      int            i;

      /*
       * Initialize the shared CDB lookup hashtable
       */
!     InitBufTable(nCDBs);

      /*
       * Get or create the shared strategy control block and the CDB's
***************
*** 908,914 ****
      StrategyControl = (BufferStrategyControl *)
          ShmemInitStruct("Buffer Strategy Status",
                          sizeof(BufferStrategyControl) +
!                         sizeof(BufferStrategyCDB) * (NBuffers * 2 - 1),
                          &found);
      StrategyCDB = &(StrategyControl->cdb[0]);

--- 755,761 ----
      StrategyControl = (BufferStrategyControl *)
          ShmemInitStruct("Buffer Strategy Status",
                          sizeof(BufferStrategyControl) +
!                         sizeof(BufferStrategyCDB) * (nCDBs - 1),
                          &found);
      StrategyCDB = &(StrategyControl->cdb[0]);

***************
*** 926,938 ****
          StrategyControl->listFreeBuffers = 0;

          /*
!          * We start off with a target T1 list size of half the available
!          * cache blocks.
           */
!         StrategyControl->target_T1_size = NBuffers / 2;

          /*
!          * Initialize B1, T1, T2 and B2 lists to be empty
           */
          for (i = 0; i < STRAT_NUM_LISTS; i++)
          {
--- 773,785 ----
          StrategyControl->listFreeBuffers = 0;

          /*
!          * We set the target T1 size to 1/4th of available buffers.
!          * Possibly this should be a runtime tunable.
           */
!         StrategyControl->target_T1_size = NBuffers / 4;

          /*
!          * Initialize all lists to be empty
           */
          for (i = 0; i < STRAT_NUM_LISTS; i++)
          {
***************
*** 947,960 ****
          /*
           * All CDB's are linked as the listUnusedCDB
           */
!         for (i = 0; i < NBuffers * 2; i++)
          {
              StrategyCDB[i].next = i + 1;
              StrategyCDB[i].list = STRAT_LIST_UNUSED;
              CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
              StrategyCDB[i].buf_id = -1;
          }
!         StrategyCDB[NBuffers * 2 - 1].next = -1;
          StrategyControl->listUnusedCDB = 0;
      }
      else
--- 794,807 ----
          /*
           * All CDB's are linked as the listUnusedCDB
           */
!         for (i = 0; i < nCDBs; i++)
          {
              StrategyCDB[i].next = i + 1;
              StrategyCDB[i].list = STRAT_LIST_UNUSED;
              CLEAR_BUFFERTAG(StrategyCDB[i].buf_tag);
              StrategyCDB[i].buf_id = -1;
          }
!         StrategyCDB[nCDBs - 1].next = -1;
          StrategyControl->listUnusedCDB = 0;
      }
      else

pgsql-patches by date:

Previous
From: Neil Conway
Date:
Subject: Re: Refactoring lock.c
Next
From: Serguei Mokhov
Date:
Subject: Re: [pgsql-patches] Proof-of-concept ARC removal patches