From 98b5bf6872b5c8b1e2d4f42c7d8a7e01a1c7858f Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Wed, 15 Oct 2025 10:53:48 -0400 Subject: [PATCH v8 1/7] Refactor goto into for loop in GetVictimBuffer() GetVictimBuffer() implemented a loop to optimistically lock a clean victim buffer using a goto. Future commits will add batch flushing functionality to GetVictimBuffer. The new logic works better with standard for loop flow control. This commit is only a refactor and does not introduce any new functionality. Author: Melanie Plageman Reviewed-by: Nazir Bilal Yavuz Reviewed-by: Chao Li Discussion: https://postgr.es/m/2FA0BAC7-5413-4ABD-94CA-4398FE77750D%40gmail.com Discussion: https://postgr.es/m/flat/CAAKRu_Yjn4mvN9NBxtmsCQSGwup45CoA4e05nhR7ADP-v0WCig%40mail.gmail.com --- src/backend/storage/buffer/bufmgr.c | 189 ++++++++++++-------------- src/backend/storage/buffer/freelist.c | 19 ++- src/include/storage/buf_internals.h | 5 + 3 files changed, 110 insertions(+), 103 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 327ddb7adc8..90c24b8d93d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -68,10 +68,6 @@ #include "utils/timestamp.h" -/* Note: these two macros only work on shared buffers, not local ones! */ -#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) -#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) - /* Note: this macro only works on local buffers, not shared ones! */ #define LocalBufHdrGetBlock(bufHdr) \ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] @@ -2331,125 +2327,116 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); - /* we return here if a prospective victim buffer gets used concurrently */ -again: - - /* - * Select a victim buffer. The buffer is returned pinned and owned by - * this backend. - */ - buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); - buf = BufferDescriptorGetBuffer(buf_hdr); - - /* - * We shouldn't have any other pins for this buffer. - */ - CheckBufferIsPinnedOnce(buf); - - /* - * If the buffer was dirty, try to write it out. There is a race - * condition here, in that someone might dirty it after we released the - * buffer header lock above, or even while we are writing it out (since - * our share-lock won't prevent hint-bit updates). We will recheck the - * dirty bit after re-locking the buffer header. - */ - if (buf_state & BM_DIRTY) + /* Select a victim buffer using an optimistic locking scheme. */ + for (;;) { - LWLock *content_lock; - Assert(buf_state & BM_TAG_VALID); - Assert(buf_state & BM_VALID); + /* Attempt to claim a victim buffer. Buffer is returned pinned. */ + buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); + buf = BufferDescriptorGetBuffer(buf_hdr); /* - * We need a share-lock on the buffer contents to write it out (else - * we might write invalid data, eg because someone else is compacting - * the page contents while we write). We must use a conditional lock - * acquisition here to avoid deadlock. Even though the buffer was not - * pinned (and therefore surely not locked) when StrategyGetBuffer - * returned it, someone else could have pinned and exclusive-locked it - * by the time we get here. If we try to get the lock unconditionally, - * we'd block waiting for them; if they later block waiting for us, - * deadlock ensues. (This has been observed to happen when two - * backends are both trying to split btree index pages, and the second - * one just happens to be trying to split the page the first one got - * from StrategyGetBuffer.) + * We shouldn't have any other pins for this buffer. */ - content_lock = BufferDescriptorGetContentLock(buf_hdr); - if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) - { - /* - * Someone else has locked the buffer, so give it up and loop back - * to get another one. - */ - UnpinBuffer(buf_hdr); - goto again; - } + CheckBufferIsPinnedOnce(buf); /* - * If using a nondefault strategy, and writing the buffer would - * require a WAL flush, let the strategy decide whether to go ahead - * and write/reuse the buffer or to choose another victim. We need a - * lock to inspect the page LSN, so this can't be done inside - * StrategyGetBuffer. + * If the buffer was dirty, try to write it out. There is a race + * condition here, in that someone might dirty it after we released + * the buffer header lock above, or even while we are writing it out + * (since our share-lock won't prevent hint-bit updates). We will + * recheck the dirty bit after re-locking the buffer header. */ - if (strategy != NULL) + if (buf_state & BM_DIRTY) { - XLogRecPtr lsn; + LWLock *content_lock; - /* Read the LSN while holding buffer header lock */ - buf_state = LockBufHdr(buf_hdr); - lsn = BufferGetLSN(buf_hdr); - UnlockBufHdr(buf_hdr); + Assert(buf_state & BM_TAG_VALID); + Assert(buf_state & BM_VALID); + + /* + * We need a share-lock on the buffer contents to write it out + * (else we might write invalid data, eg because someone else is + * compacting the page contents while we write). We must use a + * conditional lock acquisition here to avoid deadlock. Even + * though the buffer was not pinned (and therefore surely not + * locked) when StrategyGetBuffer returned it, someone else could + * have pinned and exclusive-locked it by the time we get here. If + * we try to get the lock unconditionally, we'd block waiting for + * them; if they later block waiting for us, deadlock ensues. + * (This has been observed to happen when two backends are both + * trying to split btree index pages, and the second one just + * happens to be trying to split the page the first one got from + * StrategyGetBuffer.) + */ + content_lock = BufferDescriptorGetContentLock(buf_hdr); + if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) + { + /* + * Someone else has locked the buffer, so give it up and loop + * back to get another one. + */ + UnpinBuffer(buf_hdr); + continue; + } - if (XLogNeedsFlush(lsn) - && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) + /* + * If using a nondefault strategy, and writing the buffer would + * require a WAL flush, let the strategy decide whether to go + * ahead and write/reuse the buffer or to choose another victim. + * We need the content lock to inspect the page LSN, so this can't + * be done inside StrategyGetBuffer. + */ + if (StrategyRejectBuffer(strategy, buf_hdr, from_ring)) { LWLockRelease(content_lock); UnpinBuffer(buf_hdr); - goto again; + continue; } - } - /* OK, do the I/O */ - FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); - LWLockRelease(content_lock); + /* OK, do the I/O */ + FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); + LWLockRelease(content_lock); - ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, - &buf_hdr->tag); - } + ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, + &buf_hdr->tag); + } - if (buf_state & BM_VALID) - { + if (buf_state & BM_VALID) + { + /* + * When a BufferAccessStrategy is in use, blocks evicted from + * shared buffers are counted as IOOP_EVICT in the corresponding + * context (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted + * by a strategy in two cases: 1) while initially claiming buffers + * for the strategy ring 2) to replace an existing strategy ring + * buffer because it is pinned or in use and cannot be reused. + * + * Blocks evicted from buffers already in the strategy ring are + * counted as IOOP_REUSE in the corresponding strategy context. + * + * At this point, we can accurately count evictions and reuses, + * because we have successfully claimed the valid buffer. + * Previously, we may have been forced to release the buffer due + * to concurrent pinners or erroring out. + */ + pgstat_count_io_op(IOOBJECT_RELATION, io_context, + from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0); + } + /* - * When a BufferAccessStrategy is in use, blocks evicted from shared - * buffers are counted as IOOP_EVICT in the corresponding context - * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a - * strategy in two cases: 1) while initially claiming buffers for the - * strategy ring 2) to replace an existing strategy ring buffer - * because it is pinned or in use and cannot be reused. - * - * Blocks evicted from buffers already in the strategy ring are - * counted as IOOP_REUSE in the corresponding strategy context. - * - * At this point, we can accurately count evictions and reuses, - * because we have successfully claimed the valid buffer. Previously, - * we may have been forced to release the buffer due to concurrent - * pinners or erroring out. + * If the buffer has an entry in the buffer mapping table, delete it. + * This can fail because another backend could have pinned or dirtied + * the buffer. */ - pgstat_count_io_op(IOOBJECT_RELATION, io_context, - from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0); - } + if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) + { + UnpinBuffer(buf_hdr); + continue; + } - /* - * If the buffer has an entry in the buffer mapping table, delete it. This - * can fail because another backend could have pinned or dirtied the - * buffer. - */ - if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) - { - UnpinBuffer(buf_hdr); - goto again; + break; } /* a final set of sanity checks */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 28d952b3534..acbabeb3c3b 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -15,6 +15,7 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "pgstat.h" #include "port/atomics.h" #include "storage/buf_internals.h" @@ -780,12 +781,20 @@ IOContextForStrategy(BufferAccessStrategy strategy) * be written out and doing so would require flushing WAL too. This gives us * a chance to choose a different victim. * + * The buffer must be pinned and content locked and the buffer header spinlock + * must not be held. We must hold the content lock to examine the LSN. + * * Returns true if buffer manager should ask for a new victim, and false * if this buffer should be written and re-used. */ bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring) { + XLogRecPtr lsn; + + if (!strategy) + return false; + /* We only do this in bulkread mode */ if (strategy->btype != BAS_BULKREAD) return false; @@ -795,11 +804,17 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) return false; + LockBufHdr(buf); + lsn = BufferGetLSN(buf); + UnlockBufHdr(buf); + + if (!XLogNeedsFlush(lsn)) + return false; + /* - * Remove the dirty buffer from the ring; necessary to prevent infinite + * Remove the dirty buffer from the ring; necessary to prevent an infinite * loop if all ring members are dirty. */ strategy->buffers[strategy->current] = InvalidBuffer; - return true; } diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 5400c56a965..04fdea64f83 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -486,6 +486,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer) /* * Internal buffer management routines */ + +/* Note: these two macros only work on shared buffers, not local ones! */ +#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) +#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) + /* bufmgr.c */ extern void WritebackContextInit(WritebackContext *context, int *max_pending); extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context); -- 2.43.0