--- src/backend/executor/nodeIndexscan.c.orig 2012-10-31 15:24:12.083163547 -0400 +++ src/backend/executor/nodeIndexscan.c 2012-11-01 11:45:16.244967963 -0400 @@ -35,8 +35,13 @@ #include "utils/rel.h" + static TupleTableSlot *IndexNext(IndexScanState *node); +#ifdef USE_PREFETCH +extern unsigned int prefetch_dbOid; /* database oid of relations on which prefetching to be done - 0 means all */ +extern unsigned int prefetch_index_scans; /* boolean whether to prefetch index scans */ +#endif /* USE_PREFETCH */ /* ---------------------------------------------------------------- * IndexNext @@ -418,7 +423,17 @@ ExecEndIndexScan(IndexScanState *node) * close the index relation (no-op if we didn't open it) */ if (indexScanDesc) + { +#ifdef USE_PREFETCH + /* index_endscan() frees the scan descriptor, so fetch the prefetch list pointer before calling it */ + struct BlockIdData *pfch_list = indexScanDesc->pfch_list; +#endif /* USE_PREFETCH */ index_endscan(indexScanDesc); +#ifdef USE_PREFETCH + if (pfch_list != NULL) + pfree(pfch_list); +#endif /* USE_PREFETCH */ + } if (indexRelationDesc) index_close(indexRelationDesc, NoLock); @@ -609,6 +624,23 @@ ExecInitIndexScan(IndexScan *node, EStat indexstate->iss_NumScanKeys, indexstate->iss_NumOrderByKeys); +#ifdef USE_PREFETCH + /* initialize prefetching */ + if ( prefetch_index_scans + && (target_prefetch_pages > 0) /* a zero-length list could not hold even one block */ + && (!RelationUsesLocalBuffers(indexstate->iss_ScanDesc->heapRelation)) /* I think this must always be true for an indexed heap ? 
*/ + && ( (prefetch_dbOid == 0) /* 0 means all databases */ + || (prefetch_dbOid == indexstate->iss_ScanDesc->heapRelation->rd_node.dbNode) + ) + ) { + /* palloc reports failure by raising an error and never returns NULL, so no check is needed */ + indexstate->iss_ScanDesc->pfch_list = palloc( target_prefetch_pages * sizeof(struct BlockIdData) ); + indexstate->iss_ScanDesc->pfch_used = 0; + indexstate->iss_ScanDesc->pfch_next = target_prefetch_pages; /* ensure first entry is at index 0 */ + indexstate->iss_ScanDesc->do_prefetch = 1; + } +#endif /* USE_PREFETCH */ + /* * If no run-time keys to calculate, go ahead and pass the scankeys to the * index AM. --- src/backend/executor/instrument.c.orig 2012-10-31 15:24:12.083163547 -0400 +++ src/backend/executor/instrument.c 2012-11-01 11:36:52.855258120 -0400 @@ -41,6 +41,12 @@ InstrAlloc(int n, int instrument_options { instr[i].need_bufusage = need_buffers; instr[i].need_timer = need_timer; + instr[i].bufusage_start.aio_read_noneed = 0; + instr[i].bufusage_start.aio_read_noblok = 0; + instr[i].bufusage_start.aio_read_failed = 0; + instr[i].bufusage_start.aio_read_wasted = 0; + instr[i].bufusage_start.aio_read_waited = 0; + instr[i].bufusage_start.aio_read_ontime = 0; } } @@ -145,6 +151,14 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + + dst->aio_read_noneed += add->aio_read_noneed - sub->aio_read_noneed; + dst->aio_read_noblok += add->aio_read_noblok - sub->aio_read_noblok; + dst->aio_read_failed += add->aio_read_failed - sub->aio_read_failed; + dst->aio_read_wasted += add->aio_read_wasted - sub->aio_read_wasted; + dst->aio_read_waited += add->aio_read_waited - sub->aio_read_waited; + dst->aio_read_ontime += add->aio_read_ontime - sub->aio_read_ontime; + INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, 
sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, --- src/backend/access/nbtree/nbtree.c.orig 2012-11-01 08:58:22.972791424 -0400 +++ src/backend/access/nbtree/nbtree.c 2012-11-01 11:36:52.827258026 -0400 @@ -30,6 +30,8 @@ #include "tcop/tcopprot.h" #include "utils/memutils.h" +Datum +btpeeknexttuple(IndexScanDesc scan); /* Working state for btbuild and its callback */ typedef struct @@ -338,6 +340,66 @@ btgettuple(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } +/* + * btpeeknexttuple() -- look ahead on the current index page for the next item whose heap block is not already in pfch_list + * without reading a new index page + * and without moving the btree scan position (only scan->pfch_list, pfch_used and pfch_next are written) + * if found, store blocknum in an element of pfch_list, leave pfch_next pointing at it, and return true + */ +Datum +btpeeknexttuple(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool res = false; + int itemIndex; /* current index in items[] */ + + /* + * If the scan already has a valid current position, look ahead from the item after + * it, in increasing itemIndex order only. 
If the scan is not positioned yet, bail out + */ + if ( BTScanPosIsValid(so->currPos) ) { + + itemIndex = so->currPos.itemIndex+1; /* next item */ + + /* This loop advances until it finds an item on a heap block not yet in pfch_list, or passes the last item of the index page */ + while (itemIndex <= so->currPos.lastItem) { + unsigned short int pfchx; /* index in BlockIdData array */ + for (pfchx = 0; pfchx < scan->pfch_used; pfchx++) { + if (BlockIdEquals(((scan->pfch_list)+pfchx) , &(so->currPos.items[itemIndex].heapTid.ip_blkid))) { + goto block_match; + } + } + + /* if we reach here, no block in list matched this item */ + res = true; + /* set item in prefetch list + ** prefer unused entry if there is one, else overwrite the entry after pfch_next, wrapping to 0 + */ + if (scan->pfch_used < target_prefetch_pages) { + scan->pfch_next = scan->pfch_used; + } else { + scan->pfch_next++; + if (scan->pfch_next >= target_prefetch_pages) { + scan->pfch_next = 0; + } + } + + BlockIdCopy((scan->pfch_list + scan->pfch_next) , &(so->currPos.items[itemIndex].heapTid.ip_blkid)); + /* elog(LOG,"btpeeknexttuple added blocknum %u" ,BlockIdGetBlockNumber(&(so->currPos.items[itemIndex].heapTid.ip_blkid))); */ + if (scan->pfch_used <= scan->pfch_next) { + scan->pfch_used = (scan->pfch_next + 1); + } + + goto peek_complete; + + block_match: itemIndex++; + } + } + + peek_complete: + PG_RETURN_BOOL(res); +} + /* * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap */ --- src/backend/access/heap/syncscan.c.orig 2012-10-31 15:24:12.047163492 -0400 +++ src/backend/access/heap/syncscan.c 2012-11-01 11:36:52.799257932 -0400 @@ -90,6 +90,7 @@ typedef struct ss_scan_location_t { RelFileNode relfilenode; /* identity of a relation */ BlockNumber location; /* last-reported location in the relation */ + BlockNumber prefetchHWM; /* high-water-mark of prefetched Blocknum */ } ss_scan_location_t; typedef struct ss_lru_item_t @@ -113,7 +114,7 @@ static ss_scan_locations_t *scan_locatio /* prototypes for internal functions */ static BlockNumber ss_search(RelFileNode 
relfilenode, - BlockNumber location, bool set); + BlockNumber location, bool set , BlockNumber *prefetchHWMp); /* @@ -160,6 +161,7 @@ SyncScanShmemInit(void) item->location.relfilenode.dbNode = InvalidOid; item->location.relfilenode.relNode = InvalidOid; item->location.location = InvalidBlockNumber; + item->location.prefetchHWM = InvalidBlockNumber; item->prev = (i > 0) ? (&scan_locations->items[i - 1]) : NULL; @@ -185,7 +187,7 @@ SyncScanShmemInit(void) * data structure. */ static BlockNumber -ss_search(RelFileNode relfilenode, BlockNumber location, bool set) +ss_search(RelFileNode relfilenode, BlockNumber location, bool set , BlockNumber *prefetchHWMp) { ss_lru_item_t *item; @@ -206,6 +208,22 @@ ss_search(RelFileNode relfilenode, Block { item->location.relfilenode = relfilenode; item->location.location = location; + /* if prefetch information requested (prefetchHWMp is not NULL), + ** then reconcile and either update or report back the new HWM. NOTE(review): this sits in the branch that (re)assigns the entry's relfilenode, so the stored HWM may predate that assignment, and the "else if (set)" branch below never touches the HWM - confirm both are intended. + */ + if (prefetchHWMp != NULL) + { + if ( (item->location.prefetchHWM == InvalidBlockNumber) + || (item->location.prefetchHWM < *prefetchHWMp) + ) + { + item->location.prefetchHWM = *prefetchHWMp; + } + else + { + *prefetchHWMp = item->location.prefetchHWM; + } + } } else if (set) item->location.location = location; @@ -252,7 +270,7 @@ ss_get_location(Relation rel, BlockNumbe BlockNumber startloc; LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); - startloc = ss_search(rel->rd_node, 0, false); + startloc = ss_search(rel->rd_node, 0, false , NULL); LWLockRelease(SyncScanLock); /* @@ -282,7 +300,7 @@ ss_get_location(Relation rel, BlockNumbe * same relfilenode. 
*/ void -ss_report_location(Relation rel, BlockNumber location) +ss_report_location(Relation rel, BlockNumber location , BlockNumber *prefetchHWMp) { #ifdef TRACE_SYNCSCAN if (trace_syncscan) @@ -306,7 +324,7 @@ ss_report_location(Relation rel, BlockNu { if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE)) { - (void) ss_search(rel->rd_node, location, true); + (void) ss_search(rel->rd_node, location, true , prefetchHWMp); LWLockRelease(SyncScanLock); } #ifdef TRACE_SYNCSCAN --- src/backend/access/index/indexam.c.orig 2012-11-01 08:58:22.928791446 -0400 +++ src/backend/access/index/indexam.c 2012-11-01 11:41:33.380206093 -0400 @@ -76,6 +76,45 @@ #include "utils/tqual.h" +#ifdef USE_PREFETCH +bool BlocknotinBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); +BlockNumber BlocknumOfBuffer(Buffer buffer); +void index_evict_block(IndexScanDesc scan , BlockNumber blocknumber); + +/* if specified block number is present in the prefetch array, then evict it (removes the first matching entry and closes the gap) */ +void index_evict_block(IndexScanDesc scan , BlockNumber blocknumber) +{ + unsigned short int pfchx , pfchy , pfchz; /* indexes in BlockIdData array */ + + if ( scan->do_prefetch + && (scan->pfch_list != NULL) + /* no need to check for scan->pfch_next < target_prefetch_pages + ** since we will do nothing if scan->pfch_used == 0 + */ + ) { + /* search the prefetch list to find if the block is a member */ + for (pfchx = 0; pfchx < scan->pfch_used; pfchx++) { + if (BlockIdGetBlockNumber((scan->pfch_list)+pfchx) == blocknumber) { + /* shuffle all following the evictee to the left + ** and update next pointer if its element moves (the shuffle advances pfchx to pfch_used, so the search stops after one eviction) + */ + pfchy = (scan->pfch_used - 1); /* current rightmost */ + scan->pfch_used = pfchy; + + while (pfchy > pfchx) { + pfchz = pfchx + 1; + BlockIdCopy((scan->pfch_list)+pfchx, (scan->pfch_list)+pfchz); + if (scan->pfch_next == pfchz) { + scan->pfch_next = pfchx; + } + pfchx++; + } + } + } + } +} +#endif /* USE_PREFETCH */ + /* 
---------------------------------------------------------------- * macros used in index_ routines * @@ -243,6 +282,10 @@ index_beginscan(Relation heapRelation, */ scan->heapRelation = heapRelation; scan->xs_snapshot = snapshot; +#ifdef USE_PREFETCH + scan->do_prefetch = 0; /* no prefetching by default */ + scan->pfch_list = (struct BlockIdData*)0; +#endif /* USE_PREFETCH */ return scan; } @@ -267,6 +310,10 @@ index_beginscan_bitmap(Relation indexRel * up by RelationGetIndexScan. */ scan->xs_snapshot = snapshot; +#ifdef USE_PREFETCH + scan->do_prefetch = 0; /* no prefetching by default */ + scan->pfch_list = (struct BlockIdData*)0; +#endif /* USE_PREFETCH */ return scan; } @@ -332,6 +379,12 @@ index_rescan(IndexScanDesc scan, /* Release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { +#ifdef USE_PREFETCH + /* if specified block number is present in the prefetch array, then evict it */ + if (scan->do_prefetch) { + index_evict_block(scan , BlocknumOfBuffer(scan->xs_cbuf)); + } +#endif /* USE_PREFETCH */ ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } @@ -363,10 +416,28 @@ index_endscan(IndexScanDesc scan) /* Release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { +#ifdef USE_PREFETCH + /* if specified block number is present in the prefetch array, then evict it */ + if (scan->do_prefetch) { + index_evict_block(scan , BlocknumOfBuffer(scan->xs_cbuf)); + } +#endif /* USE_PREFETCH */ ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } +#ifdef USE_PREFETCH + /* discard prefetched but unread buffers */ + if ( scan->do_prefetch + && ((struct BlockIdData*)0 != scan->pfch_list) + ) { + unsigned short int pfchx; /* index in BlockIdData array */ + for (pfchx = 0; pfchx < scan->pfch_used; pfchx++) { + DiscardBuffer(scan->heapRelation, MAIN_FORKNUM, BlockIdGetBlockNumber((scan->pfch_list)+pfchx)); + } + } +#endif /* USE_PREFETCH */ + /* End the AM's scan */ FunctionCall1(procedure, PointerGetDatum(scan)); @@ -462,6 
+533,12 @@ index_getnext_tid(IndexScanDesc scan, Sc /* ... but first, release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { +#ifdef USE_PREFETCH + /* if specified block number is present in the prefetch array, then evict it */ + if (scan->do_prefetch) { + index_evict_block(scan , BlocknumOfBuffer(scan->xs_cbuf)); + } +#endif /* USE_PREFETCH */ ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } @@ -492,6 +569,11 @@ index_getnext_tid(IndexScanDesc scan, Sc * enough information to do it efficiently in the general case. * ---------------- */ +#ifdef USE_PREFETCH +extern Datum btgettuple(PG_FUNCTION_ARGS); +extern Datum btpeeknexttuple(IndexScanDesc scan); +#endif /* USE_PREFETCH */ + HeapTuple index_fetch_heap(IndexScanDesc scan) { @@ -505,10 +587,55 @@ index_fetch_heap(IndexScanDesc scan) /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = scan->xs_cbuf; +#ifdef USE_PREFETCH + /* if the old block is different from new block, then evict old + ** block from prefetched array. It is arguable we should leave it + ** in the array because it's likely to remain in the buffer pool + ** for a while, but in that case , if we encounter the block + ** again, prefetching it again does no harm. 
+ ** (although unfortunately , if it's not pinned, prefetching it will + ** not pin it since prefetch is a noop for a buffer in the buffer pool) + */ + if ( scan->do_prefetch + && ( BufferIsValid(prev_buf) ) + && (BlocknotinBuffer(prev_buf,scan->heapRelation,ItemPointerGetBlockNumber(tid))) + && (scan->pfch_next < target_prefetch_pages) /* ensure there is an entry */ + ) { + index_evict_block(scan , BlocknumOfBuffer(prev_buf)); + } + +#endif /* USE_PREFETCH */ scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, scan->heapRelation, ItemPointerGetBlockNumber(tid)); +#ifdef USE_PREFETCH + /* try prefetching next data block + ** (next meaning one containing TIDs from matching keys + ** in same index page and different from any block + ** we previously prefetched and listed in prefetched array) + */ + { + FmgrInfo *procedure; + bool found; + + if (scan->do_prefetch) { + /* GET_SCAN_PROCEDURE(ampeeknexttuple); is correct but requires adding function to catalog which I did not do so instead */ + procedure = &scan->indexRelation->rd_aminfo->ampeeknexttuple; + /* elog(LOG,"index_fetch_heap procedure= %p procedure->fn_addr= %p" ,procedure , (procedure ? procedure->fn_addr : 0)); */ + /* if (procedure && procedure->fn_addr) { ** does the index access method support peektuple? - procedure->fn_addr is null since not in catalog so instead */ + if (procedure && (scan->indexRelation->rd_rel->relam == BTREE_AM_OID)) { /* procedure can never be NULL here, so also require a btree index: btpeeknexttuple is called directly below and treats scan->opaque as BTScanOpaque 
*/ + /* note we trust InitIndexScan verified this scan is forwards only and so set that - NOTE(review): confirm that such a check really exists */ + /* found = DatumGetBool(FunctionCall1(procedure, PointerGetDatum(scan))); cant use fmgr to call it because not in catalog so instead */ + found = DatumGetBool(btpeeknexttuple(scan)); + if (found) { + PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, BlockIdGetBlockNumber(scan->pfch_list + scan->pfch_next) , 0); + } + } + } + } +#endif /* USE_PREFETCH */ + /* * Prune page, but only if we weren't already on this page */ --- src/include/executor/instrument.h.orig 2012-10-31 15:24:12.331163938 -0400 +++ src/include/executor/instrument.h 2012-11-01 11:36:52.915258321 -0400 @@ -28,6 +28,14 @@ typedef struct BufferUsage long local_blks_written; /* # of local disk blocks written */ long temp_blks_read; /* # of temp blocks read */ long temp_blks_written; /* # of temp blocks written */ + + long aio_read_noneed; /* # of prefetches for which no need for prefetch as block already in buffer pool */ + long aio_read_noblok; /* # of prefetches for which no available BufferAiocb */ + long aio_read_failed; /* # of aio reads for which aio itself failed or the read failed with an errno */ + long aio_read_wasted; /* # of aio reads for which disk block not used */ + long aio_read_waited; /* # of aio reads for which disk block used but had to wait for it */ + long aio_read_ontime; /* # of aio reads for which disk block used and ready on time when requested */ + instr_time blk_read_time; /* time spent reading */ instr_time blk_write_time; /* time spent writing */ } BufferUsage; --- src/include/catalog/pg_am.h.orig 2012-10-31 15:24:12.323163928 -0400 +++ src/include/catalog/pg_am.h 2012-11-01 11:48:17.841593881 -0400 @@ -67,6 +67,7 @@ CATALOG(pg_am,2601) regproc amcanreturn; /* can indexscan return IndexTuples? 
*/ regproc amcostestimate; /* estimate cost of an indexscan */ regproc amoptions; /* parse AM-specific parameters */ + regproc ampeeknexttuple; /* peek, without reading a new index page, at the next tuple whose heap block is not already in pfch_list; NOTE(review): confirm the attribute-count/number macros and the pg_proc entry are updated to match - not visible in this hunk */ } FormData_pg_am; /* ---------------- @@ -117,19 +118,19 @@ typedef FormData_pg_am *Form_pg_am; * ---------------- */ -DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions )); +DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions btpeeknexttuple )); DESCR("b-tree index access method"); #define BTREE_AM_OID 403 -DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions )); +DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions - )); DESCR("hash index access method"); #define HASH_AM_OID 405 -DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions )); +DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions - )); DESCR("GiST index 
access method"); #define GIST_AM_OID 783 -DATA(insert OID = 2742 ( gin 0 5 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions )); +DATA(insert OID = 2742 ( gin 0 5 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions - )); DESCR("GIN index access method"); #define GIN_AM_OID 2742 -DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions )); +DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions - )); DESCR("SP-GiST index access method"); #define SPGIST_AM_OID 4000 --- src/include/utils/rel.h.orig 2012-10-31 15:24:12.351163971 -0400 +++ src/include/utils/rel.h 2012-11-01 11:48:17.853593923 -0400 @@ -67,6 +67,7 @@ typedef struct RelationAmInfo FmgrInfo amcanreturn; FmgrInfo amcostestimate; FmgrInfo amoptions; + FmgrInfo ampeeknexttuple; /* peek, without reading a new index page, at the next tuple whose heap block is not already in pfch_list */ } RelationAmInfo; --- src/include/access/heapam.h.orig 2012-10-31 15:24:12.319163922 -0400 +++ src/include/access/heapam.h 2012-11-01 11:59:40.307980735 -0400 @@ -160,7 +160,7 @@ extern void heap_page_prune_execute(Buff extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ -extern void ss_report_location(Relation rel, BlockNumber location); +extern void ss_report_location(Relation rel, BlockNumber location , BlockNumber *prefetchHWMp); extern 
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks); extern void SyncScanShmemInit(void); extern Size SyncScanShmemSize(void); --- src/include/access/relscan.h.orig 2012-11-01 08:58:22.980791419 -0400 +++ src/include/access/relscan.h 2012-11-01 11:36:52.895258256 -0400 @@ -43,6 +43,10 @@ typedef struct HeapScanDescData bool rs_inited; /* false = scan not init'd yet */ HeapTupleData rs_ctup; /* current tuple in scan, if any */ BlockNumber rs_cblock; /* current block # in scan, if any */ +#ifdef USE_PREFETCH + int rs_prefetch_target; /* target distance (numblocks) for prefetch to reach beyond main scan */ + BlockNumber rs_pfchblock; /* next block # to be prefetched in scan, if any */ +#endif /* USE_PREFETCH */ Buffer rs_cbuf; /* current buffer in scan, if any */ /* NB: if rs_cbuf is not InvalidBuffer, we hold a pin on that buffer */ ItemPointerData rs_mctid; /* marked scan position, if any */ @@ -74,8 +78,14 @@ typedef struct IndexScanDescData /* signaling to index AM about killing index tuples */ bool kill_prior_tuple; /* last-returned tuple is dead */ bool ignore_killed_tuples; /* do not return killed entries */ - bool xactStartedInRecovery; /* prevents killing/seeing killed - * tuples */ + bool xactStartedInRecovery; /* prevents killing/seeing killed tuples */ + +#ifdef USE_PREFETCH + struct BlockIdData* pfch_list; /* array of BlockIds which we will/have prefetched; NULL when prefetching is off */ + unsigned short int pfch_used; /* number of used elements in BlockIdData array */ + unsigned short int pfch_next; /* element of BlockIdData array most recently filled, i.e. the block to prefetch next */ + int do_prefetch; /* should I prefetch ? 
*/ +#endif /* USE_PREFETCH */ /* index access method's private state */ void *opaque; /* access-method-specific info */ --- contrib/pg_stat_statements/pg_stat_statements.c.orig 2012-10-31 15:24:11.943163326 -0400 +++ contrib/pg_stat_statements/pg_stat_statements.c 2012-11-01 11:36:52.795257920 -0400 @@ -113,6 +113,14 @@ typedef struct Counters int64 local_blks_written; /* # of local disk blocks written */ int64 temp_blks_read; /* # of temp blocks read */ int64 temp_blks_written; /* # of temp blocks written */ + + int64 aio_read_noneed; /* # of prefetches for which no need for prefetch as block already in buffer pool */ + int64 aio_read_noblok; /* # of prefetches for which no available BufferAiocb */ + int64 aio_read_failed; /* # of aio reads for which aio itself failed or the read failed with an errno */ + int64 aio_read_wasted; /* # of aio reads for which disk block not used */ + int64 aio_read_waited; /* # of aio reads for which disk block used but had to wait for it */ + int64 aio_read_ontime; /* # of aio reads for which disk block used and ready on time when requested */ + double blk_read_time; /* time spent reading, in msec */ double blk_write_time; /* time spent writing, in msec */ double usage; /* usage factor */ @@ -861,7 +869,21 @@ pgss_ProcessUtility(Node *parsetree, con bufusage.temp_blks_read = pgBufferUsage.temp_blks_read - bufusage_start.temp_blks_read; bufusage.temp_blks_written = pgBufferUsage.temp_blks_written - bufusage_start.temp_blks_written; + + bufusage.aio_read_noneed = + pgBufferUsage.aio_read_noneed - bufusage_start.aio_read_noneed; + bufusage.aio_read_noblok = + pgBufferUsage.aio_read_noblok - bufusage_start.aio_read_noblok; + bufusage.aio_read_failed = + pgBufferUsage.aio_read_failed - bufusage_start.aio_read_failed; + bufusage.aio_read_wasted = + pgBufferUsage.aio_read_wasted - bufusage_start.aio_read_wasted; + bufusage.aio_read_waited = + pgBufferUsage.aio_read_waited - bufusage_start.aio_read_waited; 
+ bufusage.aio_read_ontime = + pgBufferUsage.aio_read_ontime - bufusage_start.aio_read_ontime; + bufusage.blk_read_time = pgBufferUsage.blk_read_time; INSTR_TIME_SUBTRACT(bufusage.blk_read_time, bufusage_start.blk_read_time); bufusage.blk_write_time = pgBufferUsage.blk_write_time; @@ -876,6 +898,7 @@ pgss_ProcessUtility(Node *parsetree, con rows, &bufusage, NULL); + } else { @@ -1037,6 +1060,14 @@ pgss_store(const char *query, uint32 que e->counters.local_blks_written += bufusage->local_blks_written; e->counters.temp_blks_read += bufusage->temp_blks_read; e->counters.temp_blks_written += bufusage->temp_blks_written; + + e->counters.aio_read_noneed += bufusage->aio_read_noneed; + e->counters.aio_read_noblok += bufusage->aio_read_noblok; + e->counters.aio_read_failed += bufusage->aio_read_failed; + e->counters.aio_read_wasted += bufusage->aio_read_wasted; + e->counters.aio_read_waited += bufusage->aio_read_waited; + e->counters.aio_read_ontime += bufusage->aio_read_ontime; + e->counters.blk_read_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_read_time); e->counters.blk_write_time += INSTR_TIME_GET_MILLISEC(bufusage->blk_write_time); e->counters.usage += USAGE_EXEC(total_time); @@ -1066,7 +1097,7 @@ pg_stat_statements_reset(PG_FUNCTION_ARG } #define PG_STAT_STATEMENTS_COLS_V1_0 14 -#define PG_STAT_STATEMENTS_COLS 18 +#define PG_STAT_STATEMENTS_COLS 24 /* * Retrieve statement statistics. 
@@ -1177,6 +1208,16 @@ pg_stat_statements(PG_FUNCTION_ARGS) values[i++] = Int64GetDatumFast(tmp.local_blks_written); values[i++] = Int64GetDatumFast(tmp.temp_blks_read); values[i++] = Int64GetDatumFast(tmp.temp_blks_written); + if (sql_supports_v1_1_counters) /* the v1.0 SQL signature has no aio_read_* columns */ + { + values[i++] = Int64GetDatumFast(tmp.aio_read_noneed); + values[i++] = Int64GetDatumFast(tmp.aio_read_noblok); + values[i++] = Int64GetDatumFast(tmp.aio_read_failed); + values[i++] = Int64GetDatumFast(tmp.aio_read_wasted); + values[i++] = Int64GetDatumFast(tmp.aio_read_waited); + values[i++] = Int64GetDatumFast(tmp.aio_read_ontime); + } + if (sql_supports_v1_1_counters) { values[i++] = Float8GetDatumFast(tmp.blk_read_time); --- contrib/pg_stat_statements/pg_stat_statements--1.1.sql.orig 2012-10-31 15:24:11.943163326 -0400 +++ contrib/pg_stat_statements/pg_stat_statements--1.1.sql 2012-11-01 11:36:52.735257720 -0400 @@ -26,6 +26,12 @@ CREATE FUNCTION pg_stat_statements( OUT local_blks_written int8, OUT temp_blks_read int8, OUT temp_blks_written int8, + OUT aio_read_noneed int8, + OUT aio_read_noblok int8, + OUT aio_read_failed int8, + OUT aio_read_wasted int8, + OUT aio_read_waited int8, + OUT aio_read_ontime int8, OUT blk_read_time float8, OUT blk_write_time float8 ) --- contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql.orig 2012-10-31 15:24:11.943163326 -0400 +++ contrib/pg_stat_statements/pg_stat_statements--1.0--1.1.sql 2012-11-01 11:36:52.727257692 -0400 @@ -29,6 +29,12 @@ CREATE FUNCTION pg_stat_statements( OUT local_blks_written int8, OUT temp_blks_read int8, OUT temp_blks_written int8, + OUT aio_read_noneed int8, + OUT aio_read_noblok int8, + OUT aio_read_failed int8, + OUT aio_read_wasted int8, + OUT aio_read_waited int8, + OUT aio_read_ontime int8, OUT blk_read_time float8, OUT blk_write_time float8 )