Synchronized Scan patch - Mailing list pgsql-patches

From Jeff Davis
Subject Synchronized Scan patch
Date
Msg-id 457C78FC.3060601@j-davis.com
Whole thread Raw
List pgsql-patches
This is the Synchronized Scan patch.

This patch only implements the simplest Synchronized Scan features; I
removed some of the other features for simplicity of testing.

This patch:
(1) allocates shared memory to store a table of (relation oid,
blocknumber) pairs.
(2) When fetching a new page in a scan, it hashes the Oid of the
relation and stores a hint entry in the shared memory table
(2) When starting a new scan, it reads the shared memory table for the
hint and starts the scan at that location (which is, hopefully, near an
already-existing scan).

This isn't complete. I still plan to add some features and do a lot of
testing, depending on feedback. I am posting it here so that anyone can
test it, and so when I show results I can refer to this specific patch.

Regards,
    Jeff Davis
Only in postgresql-8.2.0-ss/src/backend/access/heap: .heapam.c.swp
diff -cr postgresql-8.2.0/src/backend/access/heap/heapam.c postgresql-8.2.0-ss/src/backend/access/heap/heapam.c
*** postgresql-8.2.0/src/backend/access/heap/heapam.c    Fri Nov 17 10:00:14 2006
--- postgresql-8.2.0-ss/src/backend/access/heap/heapam.c    Sat Dec  9 11:43:26 2006
***************
*** 54,59 ****
--- 54,60 ----
  #include "utils/lsyscache.h"
  #include "utils/relcache.h"
  #include "utils/syscache.h"
+ #include "optimizer/cost.h"


  static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
***************
*** 65,70 ****
--- 66,162 ----
   * ----------------------------------------------------------------
   */

+ static BlockNumber ss_get_startloc(Oid);
+ static int         ss_report_loc(Oid,BlockNumber);
+ static int         ss_hash_relid(Oid);
+
+ /*
+  * ss_get_startloc:
+  *
+  * This function reads the Sync Scan Hint Table to
+  * find a possible location for an already running
+  * sequential scan on this relation.
+  *
+  * By starting a sequential scan near the location
+  * of an already running scan, we improve the chance
+  * of finding pages in cache.
+  *
+  * This does some basic sanity checking, but the
+  * result is not guaranteed to be within the bounds
+  * of the relation size.
+  */
+ static BlockNumber ss_get_startloc(Oid relid)
+ {
+     char *shm;
+     ss_scan_loc_t scanloc;
+     int offset;
+     bool found;
+
+     offset = ss_hash_relid(relid)*sizeof(ss_scan_loc_t);
+     shm = (char *)ShmemInitStruct("Sync Scan Hint Table",
+         SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t),&found);
+     if(!found)
+         return 0;
+
+     scanloc = *((ss_scan_loc_t*)(shm+offset));
+
+     /*TODO:
+      * If the relid matches this relid, return the hint minus
+      * a fraction of the effective_cache_size. By starting at
+      * an earlier offset, it's likely that all of the blocks
+      * will already be cached, and the scan will quickly
+      * catch up to the head.
+      * If the relid does not match, the hint does not apply, so
+      * return 0.
+      */
+     if(scanloc.relid == relid)
+         return scanloc.loc;
+     else
+         return 0;
+ }
+
+ /*
+  * This is a simplistic function to hash
+  * the Oid of the relation for placement in
+  * the Sync Scan Hint Table
+  */
+ static int ss_hash_relid(Oid relid)
+ {
+     return relid % SYNC_SCAN_TABLE_SIZE;
+ }
+
+ /*
+  * ss_report_loc:
+  *
+  * Writes an entry in the Sync Scan Hint Table
+  * of the form (relid,blocknumber). This will
+  * overwrite any existing entry that may collide
+  * with this entry in the table.
+  *
+  * No locking is performed here. The data is
+  * considered to be untrusted when read by
+  * ss_get_startloc(), and it's callers.
+  */
+ static int ss_report_loc(Oid relid, BlockNumber loc)
+ {
+     char *shm;
+     ss_scan_loc_t scanloc;
+     int offset;
+     bool found;
+
+     scanloc.relid = relid;
+     scanloc.loc = loc;
+     offset = ss_hash_relid(relid);
+     shm = ShmemInitStruct("Sync Scan Hint Table",
+         SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t), &found);
+     if(!found)
+         return 0;
+
+     *((ss_scan_loc_t*)(shm+offset)) = scanloc;
+
+     return 0;
+ }
+
  /* ----------------
   *        initscan - scan code common to heap_beginscan and heap_rescan
   * ----------------
***************
*** 81,86 ****
--- 173,191 ----
       */
      scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);

+     /*
+      * It's important to check the results retrieved from
+      * the ss_get_startloc call. There are some sanity checks
+      * within the function, but we need to make sure that the
+      * hint value is not larger than the number of blocks in
+      * the relation, for instance in the case of a VACUUM FULL
+      * between the time the hint was placed and now.
+      */
+     scan->rs_start_page = ss_get_startloc(RelationGetRelid(scan->rs_rd));
+     if(scan->rs_start_page >= scan->rs_nblocks ||
+         scan->rs_start_page < 0)
+         scan->rs_start_page = 0;
+
      scan->rs_inited = false;
      scan->rs_ctup.t_data = NULL;
      ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 223,229 ****
                  tuple->t_data = NULL;
                  return;
              }
!             page = 0;            /* first page */
              heapgetpage(scan, page);
              lineoff = FirstOffsetNumber;        /* first offnum */
              scan->rs_inited = true;
--- 328,342 ----
                  tuple->t_data = NULL;
                  return;
              }
!             /*
!              * start the scan at the location that we chose
!              * in ss_get_startloc()
!              */
!             page = scan->rs_start_page;
!             /*TODO:
!              * only report if page - start > effective_cache_size*start_offset
!              */
!             ss_report_loc(RelationGetRelid(scan->rs_rd),page);
              heapgetpage(scan, page);
              lineoff = FirstOffsetNumber;        /* first offnum */
              scan->rs_inited = true;
***************
*** 364,378 ****
          }

          /*
!          * if we get here, it means we've exhausted the items on this page and
           * it's time to move to the next.
           */
          LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

          /*
!          * return NULL if we've exhausted all the pages
           */
!         if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
          {
              if (BufferIsValid(scan->rs_cbuf))
                  ReleaseBuffer(scan->rs_cbuf);
--- 477,508 ----
          }

          /*
!          * If we get here, it means we've exhausted the items on this page and
           * it's time to move to the next.
+          *
+          * For the forward scan, we need to wrap around to the beginning
+          * of the relation file if we reach the end.
           */
          LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

+         if(backward)
+             page--;
+         else
+             page = (page + 1) % (scan->rs_nblocks);
+
+         /*TODO:
+          * only report if page - start > effective_cache_size*start_offset
+          */
+         ss_report_loc(RelationGetRelid(scan->rs_rd),page);
+
          /*
!          * Return NULL if we've exhausted all the pages.
!          * For reverse scans, that means we've reached 0. For
!          * forward scans, that means we've reached the page on
!          * which we started.
           */
!         if ((backward && (page == 0)) ||
!             ((page%(scan->rs_nblocks)) == scan->rs_start_page))
          {
              if (BufferIsValid(scan->rs_cbuf))
                  ReleaseBuffer(scan->rs_cbuf);
***************
*** 383,390 ****
              return;
          }

-         page = backward ? (page - 1) : (page + 1);
-
          heapgetpage(scan, page);

          LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
--- 513,518 ----
***************
*** 450,456 ****
                  tuple->t_data = NULL;
                  return;
              }
!             page = 0;            /* first page */
              heapgetpage(scan, page);
              lineindex = 0;
              scan->rs_inited = true;
--- 578,592 ----
                  tuple->t_data = NULL;
                  return;
              }
!             /*
!              * start the scan at the location that we chose
!              * in ss_get_startloc()
!              */
!             page = scan->rs_start_page;
!             /*TODO:
!              * only report if page - start > effective_cache_size*start_offset
!              */
!             ss_report_loc(RelationGetRelid(scan->rs_rd),page);
              heapgetpage(scan, page);
              lineindex = 0;
              scan->rs_inited = true;
***************
*** 585,598 ****
          }

          /*
!          * if we get here, it means we've exhausted the items on this page and
           * it's time to move to the next.
           */

          /*
!          * return NULL if we've exhausted all the pages
           */
!         if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks))
          {
              if (BufferIsValid(scan->rs_cbuf))
                  ReleaseBuffer(scan->rs_cbuf);
--- 721,750 ----
          }

          /*
!          * If we get here, it means we've exhausted the items on this page and
           * it's time to move to the next.
+          *
+          * For the forward scan, we need to wrap around to the beginning
+          * of the relation file if we reach the end.
+          */
+         if(backward)
+             page--;
+         else
+             page = (page + 1) % (scan->rs_nblocks);
+
+         /*TODO:
+          * only report if page - start > effective_cache_size*start_offset
           */
+         ss_report_loc(RelationGetRelid(scan->rs_rd),page);

          /*
!          * Return NULL if we've exhausted all the pages.
!          * For reverse scans, that means we've reached 0. For
!          * forward scans, that means we've reached the page on
!          * which we started.
           */
!         if ((backward && (page == 0)) ||
!             ((page%(scan->rs_nblocks)) == scan->rs_start_page))
          {
              if (BufferIsValid(scan->rs_cbuf))
                  ReleaseBuffer(scan->rs_cbuf);
***************
*** 603,609 ****
              return;
          }

-         page = backward ? (page - 1) : (page + 1);
          heapgetpage(scan, page);

          dp = (Page) BufferGetPage(scan->rs_cbuf);
--- 755,760 ----
***************
*** 616,621 ****
--- 767,783 ----
      }
  }

+ /*
+  * SyncScanShmemSize:
+  *
+  * Called by CreateSharedMemoryAndSemaphores()
+  * to find out how much room the Sync Scan Hint
+  * Table will need to occupy.
+  */
+ Size SyncScanShmemSize(void)
+ {
+     return SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t);
+ }

  #if defined(DISABLE_COMPLEX_MACRO)
  /*
diff -cr postgresql-8.2.0/src/backend/storage/ipc/ipci.c postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c
*** postgresql-8.2.0/src/backend/storage/ipc/ipci.c    Sun Oct 15 15:04:07 2006
--- postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c    Mon Dec  4 23:47:43 2006
***************
*** 19,24 ****
--- 19,25 ----
  #include "access/nbtree.h"
  #include "access/subtrans.h"
  #include "access/twophase.h"
+ #include "access/heapam.h"
  #include "miscadmin.h"
  #include "pgstat.h"
  #include "postmaster/bgwriter.h"
***************
*** 110,115 ****
--- 111,117 ----
          size = add_size(size, FreeSpaceShmemSize());
          size = add_size(size, BgWriterShmemSize());
          size = add_size(size, BTreeShmemSize());
+         size = add_size(size, SyncScanShmemSize());
  #ifdef EXEC_BACKEND
          size = add_size(size, ShmemBackendArraySize());
  #endif
diff -cr postgresql-8.2.0/src/include/access/heapam.h postgresql-8.2.0-ss/src/include/access/heapam.h
*** postgresql-8.2.0/src/include/access/heapam.h    Sun Nov  5 14:42:10 2006
--- postgresql-8.2.0-ss/src/include/access/heapam.h    Sat Dec  9 11:33:35 2006
***************
*** 24,29 ****
--- 24,48 ----
  #include "storage/lmgr.h"
  #include "utils/rel.h"
  #include "utils/tqual.h"
+ #include <sys/shm.h>
+
+ /*
+  * Size of the Sync Scan Hint Table.
+  */
+ #define SYNC_SCAN_TABLE_SIZE 1024
+
+ #define SYNC_SCAN_THRESHOLD 2
+ #define SYNC_SCAN_START_OFFSET 0
+ /*
+  * Structure of an entry in the
+  * Sync Scan Hint Table.
+  */
+ typedef struct {
+     Oid relid;
+     BlockNumber loc;
+ } ss_scan_loc_t;
+
+ extern Size SyncScanShmemSize(void);

  /* ----------------
   *        fastgetattr
diff -cr postgresql-8.2.0/src/include/access/relscan.h postgresql-8.2.0-ss/src/include/access/relscan.h
*** postgresql-8.2.0/src/include/access/relscan.h    Tue Oct  3 17:30:07 2006
--- postgresql-8.2.0-ss/src/include/access/relscan.h    Wed Dec  6 22:31:07 2006
***************
*** 33,38 ****
--- 33,39 ----
      bool        rs_inited;        /* false = scan not init'd yet */
      HeapTupleData rs_ctup;        /* current tuple in scan, if any */
      BlockNumber rs_cblock;        /* current block # in scan, if any */
+     BlockNumber rs_start_page;  /* page where this scan began */
      Buffer        rs_cbuf;        /* current buffer in scan, if any */
      /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */
      ItemPointerData rs_mctid;    /* marked scan position, if any */

pgsql-patches by date:

Previous
From: Tom Lane
Date:
Subject: Proposed fixes for planner regressions from June to release
Next
From: "Simon Riggs"
Date:
Subject: Re: Proposed fixes for planner regressions from June torelease