From 5457ef3b17fd28be63c1ba31fcfc1d845a3010ca Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 19 Feb 2025 01:25:40 +1300 Subject: [PATCH v3 4/6] Look ahead more when sequential in read_stream.c. Previously, sequential reads would cause the look-ahead distance to fall back to io_combine_limit, on the basis that kernel read-ahead should start helping. It also meant that we'd have to ramp the distance back up when a sequential region was followed by a burst of random jumps, with little hope of avoiding a stall, which is not a good trade-off and is incompatible with AIO plans (you have to look ahead if you have to start real I/O). Simplify the algorithm: now only cache hits make the look-ahead distance drop off, and cache misses still make it grow rapidly. Random vs sequential heuristics are no longer taken into consideration while making that decision. Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com --- src/backend/storage/aio/read_stream.c | 92 ++++++++++----------------- 1 file changed, 33 insertions(+), 59 deletions(-) diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c index a8a96baf8c1..57cde89cfdc 100644 --- a/src/backend/storage/aio/read_stream.c +++ b/src/backend/storage/aio/read_stream.c @@ -17,30 +17,12 @@ * pending read. When that isn't possible, the existing pending read is sent * to StartReadBuffers() so that a new one can begin to form. * - * The algorithm for controlling the look-ahead distance tries to classify the - * stream into three ideal behaviors: + * The algorithm for controlling the look-ahead distance is based on recent + * cache hits and misses: * - * A) No I/O is necessary, because the requested blocks are fully cached - * already. There is no benefit to looking ahead more than one block, so - * distance is 1. This is the default initial assumption. - * - * B) I/O is necessary, but read-ahead advice is undesirable because the - * access is sequential and we can rely on the kernel's read-ahead heuristics, - * or impossible because direct I/O is enabled, or the system doesn't support - * read-ahead advice. There is no benefit in looking ahead more than - * io_combine_limit, because in this case the only goal is larger read system - * calls. Looking further ahead would pin many buffers and perform - * speculative work for no benefit. - * - * C) I/O is necessary, it appears to be random, and this system supports - * read-ahead advice. We'll look further ahead in order to reach the - * configured level of I/O concurrency. - * - * The distance increases rapidly and decays slowly, so that it moves towards - * those levels as different I/O patterns are discovered. For example, a - * sequential scan of fully cached data doesn't bother looking ahead, but a - * sequential scan that hits a region of uncached blocks will start issuing - * increasingly wide read calls until it plateaus at io_combine_limit. + * When no I/O is necessary, there is no point in looking ahead more than one + * block. This is the default initial assumption. Otherwise rapidly increase + * the distance to try to benefit from I/O combining and I/O concurrency. * * The main data structure is a circular queue of buffers of size * max_pinned_buffers plus some extra space for technical reasons, ready to be @@ -336,7 +318,7 @@ read_stream_start_pending_read(ReadStream *stream) /* Remember whether we need to wait before returning this buffer. */ if (!need_wait) { - /* Look-ahead distance decays, no I/O necessary (behavior A). */ + /* Look-ahead distance decays, no I/O necessary. */ if (stream->distance > 1) stream->distance--; } @@ -517,6 +499,15 @@ read_stream_begin_impl(int flags, else max_ios = get_tablespace_io_concurrency(tablespace_id); + /* + * XXX Since we don't have asynchronous I/O yet, if direct I/O is enabled + * then just behave as though I/O concurrency is set to 0. Otherwise we + * would look ahead pinning many buffers for no benefit, for lack of + * advice and AIO. + */ + if (io_direct_flags & IO_DIRECT_DATA) + max_ios = 0; + /* Cap to INT16_MAX to avoid overflowing below */ max_ios = Min(max_ios, PG_INT16_MAX); @@ -637,7 +628,7 @@ read_stream_begin_impl(int flags, /* * Skip the initial ramp-up phase if the caller says we're going to be * reading the whole relation. This way we start out assuming we'll be - * doing full io_combine_limit sized reads (behavior B). + * doing full io_combine_limit sized reads. */ if (flags & READ_STREAM_FULL) stream->distance = Min(max_pinned_buffers, stream->io_combine_limit); @@ -728,10 +719,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) #ifndef READ_STREAM_DISABLE_FAST_PATH /* - * A fast path for all-cached scans (behavior A). This is the same as the - * usual algorithm, but it is specialized for no I/O and no per-buffer - * data, so we can skip the queue management code, stay in the same buffer - * slot and use singular StartReadBuffer(). + * A fast path for all-cached scans. This is the same as the usual + * algorithm, but it is specialized for no I/O and no per-buffer data, so + * we can skip the queue management code, stay in the same buffer slot and + * use singular StartReadBuffer(). */ if (likely(stream->fast_path)) { @@ -851,37 +842,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data) if (++stream->oldest_io_index == stream->max_ios) stream->oldest_io_index = 0; - if (stream->ios[io_index].op.flags & READ_BUFFERS_ISSUE_ADVICE) - { - /* Distance ramps up fast (behavior C). */ - distance = stream->distance * 2; - distance = Min(distance, stream->max_pinned_buffers); - stream->distance = distance; + /* Look-ahead distance ramps up quickly after we do I/O. */ + distance = stream->distance * 2; + distance = Min(distance, stream->max_pinned_buffers); + stream->distance = distance; - /* - * If we've caught up with the first advice issued for the current - * sequential region, cancel further advice until the next random - * jump. The kernel should be able to see the pattern now that - * we're actually making sequential preadv() calls. - */ - if (stream->ios[io_index].op.blocknum == stream->seq_until_processed) - stream->seq_until_processed = InvalidBlockNumber; - } - else - { - /* No advice; move towards io_combine_limit (behavior B). */ - if (stream->distance > stream->io_combine_limit) - { - stream->distance--; - } - else - { - distance = stream->distance * 2; - distance = Min(distance, stream->io_combine_limit); - distance = Min(distance, stream->max_pinned_buffers); - stream->distance = distance; - } - } + /* + * If we've caught up with the first advice issued for the current + * sequential region, cancel further advice until the next random + * jump. The kernel should be able to see the pattern now that we're + * actually making sequential preadv() calls. + */ + if (stream->advice_enabled && + stream->ios[io_index].op.blocknum == stream->seq_until_processed) + stream->seq_until_processed = InvalidBlockNumber; } #ifdef CLOBBER_FREED_MEMORY -- 2.39.5