From b3dee96bfaf31a4dff31b903ad05a7d5430d23e2 Mon Sep 17 00:00:00 2001
From: Thomas Munro
Date: Sat, 12 Oct 2024 09:57:17 +1300
Subject: [PATCH 2/2] Use AutoReadStream for btree leaf page scans.

Leaf pages tend to be sequential on recently built/clustered indexes, so
there is an opportunity to do I/O combining and look-ahead.  It's hard to
do it 'precisely' with a ReadStream because they form a linked list.

XXX POC
---
 src/backend/access/nbtree/nbtpage.c   | 22 ++++++++++++++++++++++
 src/backend/access/nbtree/nbtree.c    |  5 +++++
 src/backend/access/nbtree/nbtsearch.c |  2 +-
 src/include/access/nbtree.h           |  5 +++++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index c79dd38ee18..e617bf108dd 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -856,6 +856,28 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 	return buf;
 }
 
+/*
+ * _bt_getbuf_auto() -- like _bt_getbuf(), but read blkno via auto_stream so
+ * that I/O can be combined when the blocks happen to be sequential on disk.
+ */
+Buffer
+_bt_getbuf_auto(AutoReadStream *auto_stream,
+				Relation rel,
+				BlockNumber blkno,
+				int access)
+{
+	Buffer		buf;
+
+	Assert(BlockNumberIsValid(blkno));
+
+	/* Read blkno through the stream, then lock the buffer and check the page */
+	buf = auto_read_buffer(auto_stream, blkno);
+	_bt_lockbuf(rel, buf, access);
+	_bt_checkpage(rel, buf);
+
+	return buf;
+}
+
 /*
  *	_bt_allocbuf() -- Allocate a new block/page.
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index fdff960c130..309147bea15 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -364,6 +364,9 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	/* XXX created eagerly for every scan; defer until it looks worth it? */
+	so->auto_stream = auto_read_stream_begin(NULL, rel, MAIN_FORKNUM);
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -483,6 +486,8 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	auto_read_stream_end(so->auto_stream);
+
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
 	/* Release storage */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index d69798795b4..f15e05042b4 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -2408,7 +2408,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 		{
 			/* read blkno, but check for interrupts first */
 			CHECK_FOR_INTERRUPTS();
-			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
+			so->currPos.buf = _bt_getbuf_auto(so->auto_stream, rel, blkno, BT_READ);
 		}
 		else
 		{
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e709d2e0afe..c4b82cd4466 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -22,6 +22,7 @@
 #include "catalog/pg_am_d.h"
 #include "catalog/pg_index.h"
 #include "lib/stringinfo.h"
+#include "storage/auto_read_stream.h"
 #include "storage/bufmgr.h"
 #include "storage/shm_toc.h"
 #include "utils/skipsupport.h"
@@ -1072,6 +1073,8 @@ typedef struct BTScanOpaqueData
 	int			numKilled;		/* number of currently stored items */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
+	AutoReadStream *auto_stream;	/* created in btbeginscan */
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
@@ -1273,6 +1276,8 @@ extern void _bt_metaversion(Relation rel, bool *heapkeyspace,
 							bool *allequalimage);
 extern void _bt_checkpage(Relation rel, Buffer buf);
 extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _bt_getbuf_auto(AutoReadStream *auto_stream,
+							  Relation rel, BlockNumber blkno, int access);
 extern Buffer _bt_allocbuf(Relation rel, Relation heaprel);
 extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 							   BlockNumber blkno, int access);
-- 
2.39.5 (Apple Git-154)