Synchronized Scan patch - Mailing list pgsql-patches
From | Jeff Davis |
---|---|
Subject | Synchronized Scan patch |
Date | |
Msg-id | 457C78FC.3060601@j-davis.com Whole thread Raw |
List | pgsql-patches |
This is the Synchronized Scan patch. This patch only implements the simplest Synchronized Scan features; I removed some of the other features for simplicity of testing. This patch: (1) allocates shared memory to store a table of (relation oid, blocknumber) pairs. (2) When fetching a new page in a scan, it hashes the Oid of the relation and stores a hint entry in the shared memory table (2) When starting a new scan, it reads the shared memory table for the hint and starts the scan at that location (which is, hopefully, near an already-existing scan). This isn't complete. I still plan to add some features and do a lot of testing, depending on feedback. I am posting it here so that anyone can test it, and so when I show results I can refer to this specific patch. Regards, Jeff Davis Only in postgresql-8.2.0-ss/src/backend/access/heap: .heapam.c.swp diff -cr postgresql-8.2.0/src/backend/access/heap/heapam.c postgresql-8.2.0-ss/src/backend/access/heap/heapam.c *** postgresql-8.2.0/src/backend/access/heap/heapam.c Fri Nov 17 10:00:14 2006 --- postgresql-8.2.0-ss/src/backend/access/heap/heapam.c Sat Dec 9 11:43:26 2006 *************** *** 54,59 **** --- 54,60 ---- #include "utils/lsyscache.h" #include "utils/relcache.h" #include "utils/syscache.h" + #include "optimizer/cost.h" static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, *************** *** 65,70 **** --- 66,162 ---- * ---------------------------------------------------------------- */ + static BlockNumber ss_get_startloc(Oid); + static int ss_report_loc(Oid,BlockNumber); + static int ss_hash_relid(Oid); + + /* + * ss_get_startloc: + * + * This function reads the Sync Scan Hint Table to + * find a possible location for an already running + * sequential scan on this relation. + * + * By starting a sequential scan near the location + * of an already running scan, we improve the chance + * of finding pages in cache. + * + * This does some basic sanity checking, but the + * result is not guaranteed to be within the bounds + * of the relation size. + */ + static BlockNumber ss_get_startloc(Oid relid) + { + char *shm; + ss_scan_loc_t scanloc; + int offset; + bool found; + + offset = ss_hash_relid(relid)*sizeof(ss_scan_loc_t); + shm = (char *)ShmemInitStruct("Sync Scan Hint Table", + SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t),&found); + if(!found) + return 0; + + scanloc = *((ss_scan_loc_t*)(shm+offset)); + + /*TODO: + * If the relid matches this relid, return the hint minus + * a fraction of the effective_cache_size. By starting at + * an earlier offset, it's likely that all of the blocks + * will already be cached, and the scan will quickly + * catch up to the head. + * If the relid does not match, the hint does not apply, so + * return 0. + */ + if(scanloc.relid == relid) + return scanloc.loc; + else + return 0; + } + + /* + * This is a simplistic function to hash + * the Oid of the relation for placement in + * the Sync Scan Hint Table + */ + static int ss_hash_relid(Oid relid) + { + return relid % SYNC_SCAN_TABLE_SIZE; + } + + /* + * ss_report_loc: + * + * Writes an entry in the Sync Scan Hint Table + * of the form (relid,blocknumber). This will + * overwrite any existing entry that may collide + * with this entry in the table. + * + * No locking is performed here. The data is + * considered to be untrusted when read by + * ss_get_startloc(), and it's callers. + */ + static int ss_report_loc(Oid relid, BlockNumber loc) + { + char *shm; + ss_scan_loc_t scanloc; + int offset; + bool found; + + scanloc.relid = relid; + scanloc.loc = loc; + offset = ss_hash_relid(relid); + shm = ShmemInitStruct("Sync Scan Hint Table", + SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t), &found); + if(!found) + return 0; + + *((ss_scan_loc_t*)(shm+offset)) = scanloc; + + return 0; + } + /* ---------------- * initscan - scan code common to heap_beginscan and heap_rescan * ---------------- *************** *** 81,86 **** --- 173,191 ---- */ scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd); + /* + * It's important to check the results retrieved from + * the ss_get_startloc call. There are some sanity checks + * within the function, but we need to make sure that the + * hint value is not larger than the number of blocks in + * the relation, for instance in the case of a VACUUM FULL + * between the time the hint was placed and now. + */ + scan->rs_start_page = ss_get_startloc(RelationGetRelid(scan->rs_rd)); + if(scan->rs_start_page >= scan->rs_nblocks || + scan->rs_start_page < 0) + scan->rs_start_page = 0; + scan->rs_inited = false; scan->rs_ctup.t_data = NULL; ItemPointerSetInvalid(&scan->rs_ctup.t_self); *************** *** 223,229 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; --- 328,342 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_get_startloc() ! */ ! page = scan->rs_start_page; ! /*TODO: ! * only report if page - start > effective_cache_size*start_offset ! */ ! ss_report_loc(RelationGetRelid(scan->rs_rd),page); heapgetpage(scan, page); lineoff = FirstOffsetNumber; /* first offnum */ scan->rs_inited = true; *************** *** 364,378 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 477,508 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + /*TODO: + * only report if page - start > effective_cache_size*start_offset + */ + ss_report_loc(RelationGetRelid(scan->rs_rd),page); + /* ! * Return NULL if we've exhausted all the pages. ! * For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 383,390 **** return; } - page = backward ? (page - 1) : (page + 1); - heapgetpage(scan, page); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); --- 513,518 ---- *************** *** 450,456 **** tuple->t_data = NULL; return; } ! page = 0; /* first page */ heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; --- 578,592 ---- tuple->t_data = NULL; return; } ! /* ! * start the scan at the location that we chose ! * in ss_get_startloc() ! */ ! page = scan->rs_start_page; ! /*TODO: ! * only report if page - start > effective_cache_size*start_offset ! */ ! ss_report_loc(RelationGetRelid(scan->rs_rd),page); heapgetpage(scan, page); lineindex = 0; scan->rs_inited = true; *************** *** 585,598 **** } /* ! * if we get here, it means we've exhausted the items on this page and * it's time to move to the next. */ /* ! * return NULL if we've exhausted all the pages */ ! if (backward ? (page == 0) : (page + 1 >= scan->rs_nblocks)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); --- 721,750 ---- } /* ! * If we get here, it means we've exhausted the items on this page and * it's time to move to the next. + * + * For the forward scan, we need to wrap around to the beginning + * of the relation file if we reach the end. + */ + if(backward) + page--; + else + page = (page + 1) % (scan->rs_nblocks); + + /*TODO: + * only report if page - start > effective_cache_size*start_offset */ + ss_report_loc(RelationGetRelid(scan->rs_rd),page); /* ! * Return NULL if we've exhausted all the pages. ! * For reverse scans, that means we've reached 0. For ! * forward scans, that means we've reached the page on ! * which we started. */ ! if ((backward && (page == 0)) || ! ((page%(scan->rs_nblocks)) == scan->rs_start_page)) { if (BufferIsValid(scan->rs_cbuf)) ReleaseBuffer(scan->rs_cbuf); *************** *** 603,609 **** return; } - page = backward ? (page - 1) : (page + 1); heapgetpage(scan, page); dp = (Page) BufferGetPage(scan->rs_cbuf); --- 755,760 ---- *************** *** 616,621 **** --- 767,783 ---- } } + /* + * SyncScanShmemSize: + * + * Called by CreateSharedMemoryAndSemaphores() + * to find out how much room the Sync Scan Hint + * Table will need to occupy. + */ + Size SyncScanShmemSize(void) + { + return SYNC_SCAN_TABLE_SIZE*sizeof(ss_scan_loc_t); + } #if defined(DISABLE_COMPLEX_MACRO) /* diff -cr postgresql-8.2.0/src/backend/storage/ipc/ipci.c postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c *** postgresql-8.2.0/src/backend/storage/ipc/ipci.c Sun Oct 15 15:04:07 2006 --- postgresql-8.2.0-ss/src/backend/storage/ipc/ipci.c Mon Dec 4 23:47:43 2006 *************** *** 19,24 **** --- 19,25 ---- #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" + #include "access/heapam.h" #include "miscadmin.h" #include "pgstat.h" #include "postmaster/bgwriter.h" *************** *** 110,115 **** --- 111,117 ---- size = add_size(size, FreeSpaceShmemSize()); size = add_size(size, BgWriterShmemSize()); size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif diff -cr postgresql-8.2.0/src/include/access/heapam.h postgresql-8.2.0-ss/src/include/access/heapam.h *** postgresql-8.2.0/src/include/access/heapam.h Sun Nov 5 14:42:10 2006 --- postgresql-8.2.0-ss/src/include/access/heapam.h Sat Dec 9 11:33:35 2006 *************** *** 24,29 **** --- 24,48 ---- #include "storage/lmgr.h" #include "utils/rel.h" #include "utils/tqual.h" + #include <sys/shm.h> + + /* + * Size of the Sync Scan Hint Table. + */ + #define SYNC_SCAN_TABLE_SIZE 1024 + + #define SYNC_SCAN_THRESHOLD 2 + #define SYNC_SCAN_START_OFFSET 0 + /* + * Structure of an entry in the + * Sync Scan Hint Table. + */ + typedef struct { + Oid relid; + BlockNumber loc; + } ss_scan_loc_t; + + extern Size SyncScanShmemSize(void); /* ---------------- * fastgetattr diff -cr postgresql-8.2.0/src/include/access/relscan.h postgresql-8.2.0-ss/src/include/access/relscan.h *** postgresql-8.2.0/src/include/access/relscan.h Tue Oct 3 17:30:07 2006 --- postgresql-8.2.0-ss/src/include/access/relscan.h Wed Dec 6 22:31:07 2006 *************** *** 33,38 **** --- 33,39 ---- bool rs_inited; /* false = scan not init'd yet */ HeapTupleData rs_ctup; /* current tuple in scan, if any */ BlockNumber rs_cblock; /* current block # in scan, if any */ + BlockNumber rs_start_page; /* page where this scan began */ Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */
pgsql-patches by date: