From 63cb731176a62320d296f968b12a5d4d36e703d0 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Wed, 18 Mar 2026 11:17:57 -0400 Subject: [PATCH v6 8/8] AIO: Don't wait for already in-progress IO When a backend attempts to start a read IO and finds the first buffer already has I/O in progress, previously it waited for that I/O to complete before initiating reads for any of the subsequent buffers. Although the backend must wait for the I/O to finish when acquiring the buffer, there's no reason for it to wait when setting up the read operation. Waiting at this point prevents the backend from starting I/O on subsequent buffers and can significantly reduce concurrency. This matters in two workloads: when multiple backends scan the same relation concurrently, and when a single backend requests the same block multiple times within the readahead distance. If backends wait each time they encounter an in-progress read, the access pattern effectively degenerates into synchronous I/O. To fix this, when encountering an already in-progress IO for the head buffer, a backend now records the buffer's wait reference and defers waiting until WaitReadBuffers(), when it actually needs to acquire the buffer. In rare cases, a backend may still need to wait synchronously at IO start time: if another backend has set BM_IO_IN_PROGRESS on the buffer but has not yet set the wait reference. Such windows should be brief and uncommon. Author: Melanie Plageman Reviewed-by: Andres Freund Reviewed-by: Nazir Bilal Yavuz Discussion: https://postgr.es/m/flat/zljergweqti7x67lg5ije2rzjusie37nslsnkjkkby4laqqbfw%403p3zu522yykv --- src/backend/storage/buffer/bufmgr.c | 201 +++++++++++++++++++--------- src/include/storage/bufmgr.h | 4 +- src/tools/pgindent/typedefs.list | 1 + 3 files changed, 145 insertions(+), 61 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 2179ade07cc..31d1563a69f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -185,6 +185,20 @@ typedef struct SMgrSortArray SMgrRelation srel; } SMgrSortArray; +/* + * In AsyncReadBuffers(), when preparing a buffer for reading and setting + * BM_IO_IN_PROGRESS, the buffer may already have I/O in progress or may + * already contain the desired block. AsyncReadBuffers() must distinguish + * between these cases (and the case where it should initiate I/O) so it can + * mark an in-progress buffer as foreign I/O rather than waiting on it. + */ +typedef enum PrepareReadBufferStatus +{ + READ_BUFFER_ALREADY_DONE, + READ_BUFFER_IN_PROGRESS, + READ_BUFFER_READY_FOR_IO, +} PrepareReadBufferStatus; + /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; @@ -1663,8 +1677,9 @@ CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete) * Local version of PrepareHeadBufferReadIO(). Here instead of localbuf.c to * avoid an external function call. */ -static bool -PrepareHeadLocalBufferReadIO(Buffer buffer) +static PrepareReadBufferStatus +PrepareHeadLocalBufferReadIO(ReadBuffersOperation *operation, + Buffer buffer) { BufferDesc *desc = GetLocalBufferDescriptor(-buffer - 1); uint64 buf_state = pg_atomic_read_u64(&desc->state); @@ -1675,49 +1690,60 @@ PrepareHeadLocalBufferReadIO(Buffer buffer) * handle). Only the owning backend can set BM_VALID on a local buffer. */ if (buf_state & BM_VALID) - return false; + return READ_BUFFER_ALREADY_DONE; /* * Submit any staged IO before checking for in-progress IO. Without this, * the wref check below could find IO that this backend staged but hasn't - * submitted yet. Waiting on that would PANIC because the owner can't wait - * on its own staged IO. + * submitted yet. If we returned READ_BUFFER_IN_PROGRESS and + * WaitReadBuffers() then tried to wait on it, we'd PANIC because the + * owner can't wait on its own staged IO. */ pgaio_submit_staged(); - /* Wait for in-progress IO */ + /* We've already asynchronously started this IO, so join it */ if (pgaio_wref_valid(&desc->io_wref)) { - PgAioWaitRef iow = desc->io_wref; - - pgaio_wref_wait(&iow); - - buf_state = pg_atomic_read_u64(&desc->state); + operation->io_wref = desc->io_wref; + operation->foreign_io = true; + return READ_BUFFER_IN_PROGRESS; } - /* - * If BM_VALID is set, we waited on IO and it completed successfully. - * Otherwise, we'll initiate IO on the buffer. - */ - return !(buf_state & BM_VALID); + /* Prepare to start IO on this buffer */ + return READ_BUFFER_READY_FOR_IO; } /* * Try to start IO on the first buffer in a new run of blocks. If AIO is in - * progress, be it in this backend or another backend, we wait for it to - * finish and then check the result. + * progress, be it in this backend or another backend, we just associate the + * wait reference with the operation and wait in WaitReadBuffers(). This turns + * out to be important for performance in two workloads: + * + * 1) A read stream that has to read the same block multiple times within the + * readahead distance. This can happen e.g. for the table accesses of an + * index scan. + * + * 2) Concurrent scans by multiple backends on the same relation. + * + * If we were to synchronously wait for the in-progress IO, we'd not be able + * to keep enough I/O in flight. + * + * If we do find there is ongoing I/O for the buffer, we set up a 1-block + * ReadBuffersOperation that WaitReadBuffers then can wait on. * - * Returns true if the buffer is ready for IO, false if the buffer is already - * valid. + * It's possible that another backend has started IO on the buffer but not yet + * set its wait reference. In this case, we have no choice but to wait for + * either the wait reference to be valid or the IO to be done. */ -static bool -PrepareHeadBufferReadIO(Buffer buffer) +static PrepareReadBufferStatus +PrepareHeadBufferReadIO(ReadBuffersOperation *operation, + Buffer buffer) { uint64 buf_state; BufferDesc *desc; if (BufferIsLocal(buffer)) - return PrepareHeadLocalBufferReadIO(buffer); + return PrepareHeadLocalBufferReadIO(operation, buffer); ResourceOwnerEnlarge(CurrentResourceOwner); desc = GetBufferDescriptor(buffer - 1); @@ -1732,11 +1758,25 @@ PrepareHeadBufferReadIO(Buffer buffer) if (buf_state & BM_VALID) { UnlockBufHdr(desc); - return false; + return READ_BUFFER_ALREADY_DONE; } if (buf_state & BM_IO_IN_PROGRESS) { + /* Join existing read */ + if (pgaio_wref_valid(&desc->io_wref)) + { + operation->io_wref = desc->io_wref; + operation->foreign_io = true; + UnlockBufHdr(desc); + return READ_BUFFER_IN_PROGRESS; + } + + /* + * If the wait ref is not valid but the IO is in progress, someone + * else started IO but hasn't set the wait ref yet. We have no + * choice but to wait until the IO completes. + */ UnlockBufHdr(desc); /* @@ -1758,7 +1798,7 @@ PrepareHeadBufferReadIO(Buffer buffer) ResourceOwnerRememberBufferIO(CurrentResourceOwner, BufferDescriptorGetBuffer(desc)); - return true; + return READ_BUFFER_READY_FOR_IO; } } @@ -1939,8 +1979,11 @@ WaitReadBuffers(ReadBuffersOperation *operation) * b) reports some time as waiting, even if we never waited * * we first check if we already know the IO is complete. + * + * Note that operation->io_return is uninitialized for foreign IO, + * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check. */ - if (aio_ret->result.status == PGAIO_RS_UNKNOWN && + if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) && !pgaio_wref_check_done(&operation->io_wref)) { instr_time io_start = pgstat_prepare_io_time(track_io_timing); @@ -1959,11 +2002,45 @@ WaitReadBuffers(ReadBuffersOperation *operation) Assert(pgaio_wref_check_done(&operation->io_wref)); } - /* - * We now are sure the IO completed. Check the results. This - * includes reporting on errors if there were any. - */ - ProcessReadBuffersResult(operation); + if (unlikely(operation->foreign_io)) + { + Buffer buffer = operation->buffers[operation->nblocks_done]; + BufferDesc *desc = BufferIsLocal(buffer) ? + GetLocalBufferDescriptor(-buffer - 1) : + GetBufferDescriptor(buffer - 1); + uint64 buf_state = pg_atomic_read_u64(&desc->state); + + if (buf_state & BM_VALID) + { + BlockNumber blocknum = operation->blocknum + operation->nblocks_done; + + operation->nblocks_done += 1; + Assert(operation->nblocks_done <= operation->nblocks); + + /* + * Track this as a 'hit' for this backend. The backend + * performing the IO will track it as a 'read'. + */ + TrackBufferHit(io_object, io_context, + operation->rel, operation->persistence, + operation->smgr, operation->forknum, + blocknum); + } + + /* + * If the foreign IO failed and left the buffer invalid, + * nblocks_done is not incremented. The retry loop below will + * call AsyncReadBuffers() which will attempt the IO itself. + */ + } + else + { + /* + * We now are sure the IO completed. Check the results. This + * includes reporting on errors if there were any. + */ + ProcessReadBuffersResult(operation); + } } /* @@ -2009,7 +2086,8 @@ WaitReadBuffers(ReadBuffersOperation *operation) * affected by the call. If the first buffer is valid, *nblocks_progress is * set to 1 and operation->nblocks_done is incremented. * - * Returns true if IO was initiated, false if no IO was necessary. + * Returns true if IO was initiated or is already in progress (foreign IO), + * false if the buffer was already valid. */ static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) @@ -2028,6 +2106,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) IOContext io_context; IOObject io_object; instr_time io_start; + PrepareReadBufferStatus status; if (persistence == RELPERSISTENCE_TEMP) { @@ -2066,40 +2145,42 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return); } + operation->foreign_io = false; pgaio_wref_clear(&operation->io_wref); - /* - * Check if we can start IO on the first to-be-read buffer. - * - * If an I/O is already in progress in another backend, we want to wait - * for the outcome: either done, or something went wrong and we will - * retry. - */ - if (!PrepareHeadBufferReadIO(buffers[nblocks_done])) + /* Check if we can start IO on the first to-be-read buffer */ + status = PrepareHeadBufferReadIO(operation, buffers[nblocks_done]); + if (status != READ_BUFFER_READY_FOR_IO) { - /* - * Someone has already completed this block, we're done. - * - * When IO is necessary, ->nblocks_done is updated in - * ProcessReadBuffersResult(), but that is not called if no IO is - * necessary. Thus update here. - */ - operation->nblocks_done += 1; + pgaio_io_release(ioh); *nblocks_progress = 1; + if (status == READ_BUFFER_ALREADY_DONE) + { + /* + * Someone has already completed this block, we're done. + * + * When IO is necessary, ->nblocks_done is updated in + * ProcessReadBuffersResult(), but that is not called if no IO is + * necessary. Thus update here. + */ + operation->nblocks_done += 1; + Assert(operation->nblocks_done <= operation->nblocks); - pgaio_io_release(ioh); - pgaio_wref_clear(&operation->io_wref); + /* + * Report and track this as a 'hit' for this backend, even though + * it must have started out as a miss in PinBufferForBlock(). The + * other backend will track this as a 'read'. + */ + TrackBufferHit(io_object, io_context, + operation->rel, operation->persistence, + operation->smgr, operation->forknum, + blocknum); + return false; + } - /* - * Report and track this as a 'hit' for this backend, even though it - * must have started out as a miss in PinBufferForBlock(). The other - * backend will track this as a 'read'. - */ - TrackBufferHit(io_object, io_context, - operation->rel, operation->persistence, - operation->smgr, operation->forknum, - blocknum); - return false; + /* The IO is already in-progress */ + Assert(status == READ_BUFFER_IN_PROGRESS); + return true; } /* diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 4017896f951..dd41b92f944 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -144,9 +144,11 @@ struct ReadBuffersOperation */ Buffer *buffers; BlockNumber blocknum; - int flags; + uint16 flags; int16 nblocks; int16 nblocks_done; + /* true if waiting on another backend's IO */ + bool foreign_io; PgAioWaitRef io_wref; PgAioReturn io_return; }; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 340662cf72c..ffaea427952 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2365,6 +2365,7 @@ PredicateLockData PredicateLockTargetType PrefetchBufferResult PrepParallelRestorePtrType +PrepareReadBufferStatus PrepareStmt PreparedStatement PresortedKeyData -- 2.43.0