From e2a94f1f38fb73289a9d7701ab3a2e5bb5370374 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Wed, 1 Jan 2025 22:10:37 +0100 Subject: [PATCH v20250501 3/7] WIP: Don't read the same block repeatedly --- src/backend/access/heap/heapam_handler.c | 57 ++++++++++++++++++++++-- src/backend/access/index/indexam.c | 13 ++++++ src/backend/storage/buffer/bufmgr.c | 40 +++++++++++++++++ src/include/access/relscan.h | 2 + src/include/storage/bufmgr.h | 2 + 5 files changed, 111 insertions(+), 3 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index f79d97a8c64..326d5fed681 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -136,6 +136,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, { /* Switch to correct buffer if we don't have it already */ Buffer prev_buf = hscan->xs_cbuf; + bool release_prev = true; /* * Read the block for the requested TID. With a read stream, simply @@ -157,7 +158,56 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * API. */ if (scan->rs) - hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + { + /* + * If we're trying to read the same block as the last time, don't + * try reading it from the stream again, but just return the last + * buffer. We need to check if the previous buffer is still pinned + * and contains the correct block (it might have been unpinned, + * used for a different block, so we need to be careful). + * + * The place scheduling the blocks (index_scan_stream_read_next) + * needs to do the same thing and not schedule the blocks if it + * matches the previous one. Otherwise the stream will get out of + * sync, causing confusion. + * + * This is what ReleaseAndReadBuffer does too, but it does not + * have a queue of requests scheduled from somewhere else, so it + * does not need to worry about that. + * + * XXX Maybe we should remember the block in IndexFetchTableData, + * so that we can make the check even cheaper, without looking at + * the buffer descriptor? But that assumes the buffer was not + * unpinned (or repinned) elsewhere, before we got back here. But + * can that even happen? If yes, I guess we shouldn't be releasing + * the prev buffer anyway. + * + * XXX This has undesired impact on prefetch distance. The read + * stream schedules reads for a certain number of future blocks, + * but if we skip duplicate blocks, the prefetch distance may get + * unexpectedly large (e.g. for correlated indexes, with long runs + * of TIDs from the same heap page). This may spend a lot of CPU + * time in the index_scan_stream_read_next callback, but more + * importantly it may require reading (and keeping) a lot of leaf + * pages from the index. + * + * XXX What if we pinned the buffer twice (increase the refcount), + * so that if the caller unpins the buffer, we still keep the + * second pin. Wouldn't that mean we don't need to worry about the + * possibility someone loaded another page into the buffer? + * + * XXX We might also keep a longer history of recent blocks, not + * just the immediately preceding one. But that makes it harder, + * because the two places (read_next callback and here) need to + * have a slightly different view. + */ + if (BufferMatches(hscan->xs_cbuf, + hscan->xs_base.rel, + ItemPointerGetBlockNumber(tid))) + release_prev = false; + else + hscan->xs_cbuf = read_stream_next_buffer(scan->rs, NULL); + } else hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf, hscan->xs_base.rel, @@ -181,7 +231,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf); /* - * When using the read stream, release the old buffer. + * When using the read stream, release the old buffer - but only if + * we're reading a different block. * * XXX Not sure this is really needed, or maybe this is not the right * place to do this, and buffers should be released elsewhere. The @@ -199,7 +250,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan, * XXX Does this do the right thing when reading the same page? That * should return the same buffer, so won't we release it prematurely? */ - if (scan->rs && (prev_buf != InvalidBuffer)) + if (scan->rs && (prev_buf != InvalidBuffer) && release_prev) { ReleaseBuffer(prev_buf); } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index c5ba89499f1..f0fda6d761c 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -1915,6 +1915,15 @@ index_scan_stream_read_next(ReadStream *stream, continue; } + /* same block as before, don't need to read it */ + if (scan->xs_batches->lastBlock == ItemPointerGetBlockNumber(tid)) + { + DEBUG_LOG("index_scan_stream_read_next: skip block (lastBlock)"); + continue; + } + + scan->xs_batches->lastBlock = ItemPointerGetBlockNumber(tid); + return ItemPointerGetBlockNumber(tid); } @@ -2118,6 +2127,7 @@ index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) */ scan->xs_batches->direction = direction; scan->xs_batches->finished = false; + scan->xs_batches->lastBlock = InvalidBlockNumber; index_batch_pos_reset(scan, &scan->xs_batches->streamPos); read_stream_reset(scan->xs_heapfetch->rs); @@ -2232,6 +2242,7 @@ index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction) scan->xs_batches->readPos.batch, scan->xs_batches->readPos.index); scan->xs_batches->reset = false; + scan->xs_batches->lastBlock = InvalidBlockNumber; /* * Need to reset the stream position, it might be too far behind. @@ -2311,6 +2322,7 @@ index_batch_init(IndexScanDesc scan) index_batch_pos_reset(scan, &scan->xs_batches->markPos); // scan->xs_batches->currentBatch = NULL; + scan->xs_batches->lastBlock = InvalidBlockNumber; } /* @@ -2393,6 +2405,7 @@ index_batch_reset(IndexScanDesc scan, bool complete) batches->finished = false; batches->reset = false; // batches->currentBatch = NULL; + batches->lastBlock = InvalidBlockNumber; AssertCheckBatches(scan); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 0b317d2d809..35c3526e250 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3045,6 +3045,46 @@ ReleaseAndReadBuffer(Buffer buffer, return ReadBuffer(relation, blockNum); } +/* + * BufferMatches + * Check if the buffer (still) contains the expected page. + * + * Check if the buffer contains the expected page. The buffer may be invalid, + * or valid and pinned. + */ +bool +BufferMatches(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + ForkNumber forkNum = MAIN_FORKNUM; + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + { + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + else + { + bufHdr = GetBufferDescriptor(buffer - 1); + /* we have pin, so it's ok to examine tag without spinlock */ + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return true; + } + } + + return false; +} + /* * PinBuffer -- make buffer unavailable for replacement. * diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b63af845ca6..2bbd0db0223 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -242,6 +242,8 @@ typedef struct IndexScanBatches bool finished; bool reset; + BlockNumber lastBlock; + /* * Current scan direction, for the currently loaded batches. This is used * to load data in the read stream API callback, etc. diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 41fdc1e7693..3b7d4e6a6a2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -237,6 +237,8 @@ extern void IncrBufferRefCount(Buffer buffer); extern void CheckBufferIsPinnedOnce(Buffer buffer); extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); +extern bool BufferMatches(Buffer buffer, Relation relation, + BlockNumber blockNum); extern Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, -- 2.49.0