From 3a2ab0230c1e71f45d607a58697db98b5861e4b5 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 22 Mar 2026 15:19:08 -0400 Subject: [PATCH v18 19/19] WIP: aio: bufmgr: Fix race condition leading to deadlocks with io_uring If backend A is in the process of starting IO for a buffer, there is a short period in which the buffer is marked as IO_IN_PROGRESS without having an associated AIO wait reference. If a backend B does WaitIO() on that buffer, it'll wait for the buffer's IO condition variable to be set. Most of the time that is OK, when the IO on the buffer finishes, the CV will be signalled. However, with io_uring, it is possible that the issuer (A) of the IO never gets around to doing so, e.g. because it is waiting for something done by B. To fix that, we need to signal the CV when staging IO. That's annoying as CV broadcasts are not cheap. So we at least avoid it for the common case of IO being executed synchronously. I hope that eventually we can get away from needing multiple systems for signalling IO completion, but we are clearly not there yet. --- src/backend/storage/buffer/bufmgr.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 896ec2a4e..e1c44f05d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8364,7 +8364,7 @@ MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied, * replaced while IO is ongoing. */ static pg_attribute_always_inline void -buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp) +buffer_stage_common(PgAioHandle *ioh, uint8 cb_data, bool is_write, bool is_temp) { uint64 *io_data; uint8 handle_data_len; @@ -8463,7 +8463,23 @@ buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp) * keeps track. 
*/ if (!is_temp) + { ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer); + + /* + * A backend might have started waiting for the IO using the + * buffer's condition variable, but once the IO is submitted, it + * should wait via the AIO subsystem, as a waiter might need to + * complete the IO. + * + * However, doing broadcasts is not free, so we'd like to avoid it + * when not necessary. If the IO is being executed synchronously, + * this backend will always end up signalling the IOCV without + * further waiting, therefore avoid doing so here. + */ + if (!(cb_data & READ_BUFFERS_SYNCHRONOUSLY)) + ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf_hdr)); + } } } @@ -8952,7 +8968,7 @@ buffer_readv_report(PgAioResult result, const PgAioTargetData *td, static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) { - buffer_stage_common(ioh, false, false); + buffer_stage_common(ioh, cb_data, false, false); } static PgAioResult @@ -9003,7 +9019,7 @@ shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) { - buffer_stage_common(ioh, false, true); + buffer_stage_common(ioh, cb_data, false, true); } static PgAioResult -- 2.53.0