From 8fd1a6a68d80e3473ce34f86b0ebb38c15641bab Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 2 Sep 2025 11:00:44 -0400 Subject: [PATCH v3 1/9] Refactor goto into for loop in GetVictimBuffer() GetVictimBuffer() implemented a loop to optimistically lock a clean victim buffer using a goto. Future commits will add batch flushing functionality to GetVictimBuffer. The new logic works better with a regular for loop flow control. This commit is only a refactor and does not introduce any new functionality. --- src/backend/storage/buffer/bufmgr.c | 200 ++++++++++++-------------- src/backend/storage/buffer/freelist.c | 17 +++ src/include/storage/buf_internals.h | 5 + 3 files changed, 116 insertions(+), 106 deletions(-) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index fe470de63f2..f3668051574 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -68,10 +68,6 @@ #include "utils/timestamp.h" -/* Note: these two macros only work on shared buffers, not local ones! */ -#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) -#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) - /* Note: this macro only works on local buffers, not shared ones! */ #define LocalBufHdrGetBlock(bufHdr) \ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] @@ -2344,130 +2340,122 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); - /* we return here if a prospective victim buffer gets used concurrently */ -again: - - /* - * Select a victim buffer. The buffer is returned with its header - * spinlock still held! 
- */ - buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); - buf = BufferDescriptorGetBuffer(buf_hdr); - - Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); - - /* Pin the buffer and then release the buffer spinlock */ - PinBuffer_Locked(buf_hdr); - - /* - * We shouldn't have any other pins for this buffer. - */ - CheckBufferIsPinnedOnce(buf); - - /* - * If the buffer was dirty, try to write it out. There is a race - * condition here, in that someone might dirty it after we released the - * buffer header lock above, or even while we are writing it out (since - * our share-lock won't prevent hint-bit updates). We will recheck the - * dirty bit after re-locking the buffer header. - */ - if (buf_state & BM_DIRTY) + /* Select a victim buffer using an optimistic locking scheme. */ + for (;;) { - LWLock *content_lock; + /* + * Attempt to claim a victim buffer. The buffer is returned with its + * header spinlock still held! + */ + buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); + buf = BufferDescriptorGetBuffer(buf_hdr); - Assert(buf_state & BM_TAG_VALID); - Assert(buf_state & BM_VALID); + Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); + + /* Pin the buffer and then release the buffer spinlock */ + PinBuffer_Locked(buf_hdr); /* - * We need a share-lock on the buffer contents to write it out (else - * we might write invalid data, eg because someone else is compacting - * the page contents while we write). We must use a conditional lock - * acquisition here to avoid deadlock. Even though the buffer was not - * pinned (and therefore surely not locked) when StrategyGetBuffer - * returned it, someone else could have pinned and exclusive-locked it - * by the time we get here. If we try to get the lock unconditionally, - * we'd block waiting for them; if they later block waiting for us, - * deadlock ensues. 
(This has been observed to happen when two - * backends are both trying to split btree index pages, and the second - * one just happens to be trying to split the page the first one got - * from StrategyGetBuffer.) + * We shouldn't have any other pins for this buffer. */ - content_lock = BufferDescriptorGetContentLock(buf_hdr); - if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) - { - /* - * Someone else has locked the buffer, so give it up and loop back - * to get another one. - */ - UnpinBuffer(buf_hdr); - goto again; - } + CheckBufferIsPinnedOnce(buf); /* - * If using a nondefault strategy, and writing the buffer would - * require a WAL flush, let the strategy decide whether to go ahead - * and write/reuse the buffer or to choose another victim. We need a - * lock to inspect the page LSN, so this can't be done inside - * StrategyGetBuffer. + * If the buffer was dirty, try to write it out. There is a race + * condition here, in that someone might dirty it after we released + * the buffer header lock above, or even while we are writing it out + * (since our share-lock won't prevent hint-bit updates). We will + * recheck the dirty bit after re-locking the buffer header. */ - if (strategy != NULL) + if (buf_state & BM_DIRTY) { - XLogRecPtr lsn; + LWLock *content_lock; - /* Read the LSN while holding buffer header lock */ - buf_state = LockBufHdr(buf_hdr); - lsn = BufferGetLSN(buf_hdr); - UnlockBufHdr(buf_hdr, buf_state); + Assert(buf_state & BM_TAG_VALID); + Assert(buf_state & BM_VALID); - if (XLogNeedsFlush(lsn) - && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) + /* + * We need a share-lock on the buffer contents to write it out + * (else we might write invalid data, eg because someone else is + * compacting the page contents while we write). We must use a + * conditional lock acquisition here to avoid deadlock. 
Even + * though the buffer was not pinned (and therefore surely not + * locked) when StrategyGetBuffer returned it, someone else could + * have pinned and exclusive-locked it by the time we get here. If + * we try to get the lock unconditionally, we'd block waiting for + * them; if they later block waiting for us, deadlock ensues. + * (This has been observed to happen when two backends are both + * trying to split btree index pages, and the second one just + * happens to be trying to split the page the first one got from + * StrategyGetBuffer.) + */ + content_lock = BufferDescriptorGetContentLock(buf_hdr); + if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) + { + /* + * Someone else has locked the buffer, so give it up and loop + * back to get another one. + */ + UnpinBuffer(buf_hdr); + continue; + } + + /* + * If using a nondefault strategy, and writing the buffer would + * require a WAL flush, let the strategy decide whether to go + * ahead and write/reuse the buffer or to choose another victim. + * We need the content lock to inspect the page LSN, so this can't + * be done inside StrategyGetBuffer. + */ + if (StrategyRejectBuffer(strategy, buf_hdr, from_ring)) { LWLockRelease(content_lock); UnpinBuffer(buf_hdr); - goto again; + continue; } - } - /* OK, do the I/O */ - FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); - LWLockRelease(content_lock); + /* OK, do the I/O */ + FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); + LWLockRelease(content_lock); - ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, - &buf_hdr->tag); - } + ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, + &buf_hdr->tag); + } + if (buf_state & BM_VALID) + { + /* + * When a BufferAccessStrategy is in use, blocks evicted from + * shared buffers are counted as IOOP_EVICT in the corresponding + * context (e.g. IOCONTEXT_BULKWRITE). 
Shared buffers are evicted + * by a strategy in two cases: 1) while initially claiming buffers + * for the strategy ring 2) to replace an existing strategy ring + * buffer because it is pinned or in use and cannot be reused. + * + * Blocks evicted from buffers already in the strategy ring are + * counted as IOOP_REUSE in the corresponding strategy context. + * + * At this point, we can accurately count evictions and reuses, + * because we have successfully claimed the valid buffer. + * Previously, we may have been forced to release the buffer due + * to concurrent pinners or erroring out. + */ + pgstat_count_io_op(IOOBJECT_RELATION, io_context, + from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0); + } - if (buf_state & BM_VALID) - { /* - * When a BufferAccessStrategy is in use, blocks evicted from shared - * buffers are counted as IOOP_EVICT in the corresponding context - * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a - * strategy in two cases: 1) while initially claiming buffers for the - * strategy ring 2) to replace an existing strategy ring buffer - * because it is pinned or in use and cannot be reused. - * - * Blocks evicted from buffers already in the strategy ring are - * counted as IOOP_REUSE in the corresponding strategy context. - * - * At this point, we can accurately count evictions and reuses, - * because we have successfully claimed the valid buffer. Previously, - * we may have been forced to release the buffer due to concurrent - * pinners or erroring out. + * If the buffer has an entry in the buffer mapping table, delete it. + * This can fail because another backend could have pinned or dirtied + * the buffer. Then loop around and try again. */ - pgstat_count_io_op(IOOBJECT_RELATION, io_context, - from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0); - } + if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) + { + UnpinBuffer(buf_hdr); + continue; + } - /* - * If the buffer has an entry in the buffer mapping table, delete it. 
This - * can fail because another backend could have pinned or dirtied the - * buffer. - */ - if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) - { - UnpinBuffer(buf_hdr); - goto again; + break; } /* a final set of sanity checks */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7d59a92bd1a..a90a7ed4e16 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -15,6 +15,7 @@ */ #include "postgres.h" +#include "access/xlog.h" #include "pgstat.h" #include "port/atomics.h" #include "storage/buf_internals.h" @@ -716,12 +717,21 @@ IOContextForStrategy(BufferAccessStrategy strategy) * be written out and doing so would require flushing WAL too. This gives us * a chance to choose a different victim. * + * The buffer must be pinned and content locked and the buffer header spinlock + * must not be held. We must have the content lock to examine the LSN. + * * Returns true if buffer manager should ask for a new victim, and false * if this buffer should be written and re-used. */ bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring) { + uint32 buf_state; + XLogRecPtr lsn; + + if (!strategy) + return false; + /* We only do this in bulkread mode */ if (strategy->btype != BAS_BULKREAD) return false; @@ -731,6 +741,13 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) return false; + buf_state = LockBufHdr(buf); + lsn = BufferGetLSN(buf); + UnlockBufHdr(buf, buf_state); + + if (XLogNeedsFlush(lsn)) + return true; + /* * Remove the dirty buffer from the ring; necessary to prevent infinite * loop if all ring members are dirty. 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index dfd614f7ca4..b1b81f31419 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -419,6 +419,11 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer) /* * Internal buffer management routines */ + +/* Note: these two macros only work on shared buffers, not local ones! */ +#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) +#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) + /* bufmgr.c */ extern void WritebackContextInit(WritebackContext *context, int *max_pending); extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context); -- 2.43.0