From 5de90fcd14e5d32c3165c3e7a278adaa44f4d9d1 Mon Sep 17 00:00:00 2001
From: Alexandre Felipe
Date: Wed, 11 Mar 2026 12:05:50 +0000
Subject: [PATCH 2/5] Refactor reference counting

This moves the private reference count logic out of
src/backend/storage/buffer/bufmgr.c, which is still more than 8000 lines
long, in preparation for the changes to come.
---
 .gitignore                                 |   2 +
 src/backend/storage/buffer/Makefile        |   3 +
 src/backend/storage/buffer/bufmgr.c        | 704 +++------------------
 src/backend/storage/buffer/bufmgr_refcnt.c | 615 ++++++++++++++++++
 src/backend/storage/buffer/bufmgr_refcnt.h |  59 ++
 5 files changed, 751 insertions(+), 632 deletions(-)
 create mode 100644 src/backend/storage/buffer/bufmgr_refcnt.c
 create mode 100644 src/backend/storage/buffer/bufmgr_refcnt.h

diff --git a/.gitignore b/.gitignore
index 4e911395fe..fddb7f861d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,5 @@ lib*.pc
 /Release/
 /tmp_install/
 /portlock/
+
+.*
\ No newline at end of file
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index fd7c40dcb0..6f45674cae 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -20,3 +20,6 @@ OBJS = \
 	localbuf.o
 
 include $(top_srcdir)/src/backend/common.mk
+
+# bufmgr.c includes bufmgr_refcnt.c for inlining
+bufmgr.o: bufmgr_refcnt.c bufmgr_refcnt.h
\ No newline at end of file
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0546ee0193..82af282f70 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -54,6 +54,7 @@
 #include "storage/aio.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "bufmgr_refcnt.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
@@ -70,7 +71,6 @@
 #include "utils/timestamp.h"
 #include "utils/wait_event.h"
 
-
 /* Note: these two macros only work on shared buffers, not local ones! */
 #define BufHdrGetBlock(bufHdr)	((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 #define BufferGetLSN(bufHdr)	(PageGetLSN(BufHdrGetBlock(bufHdr)))
@@ -93,43 +93,6 @@
  */
 #define BUF_DROP_FULL_SCAN_THRESHOLD	(uint64) (NBuffers / 32)
 
-/*
- * This is separated out from PrivateRefCountEntry to allow for copying all
- * the data members via struct assignment.
- */
-typedef struct PrivateRefCountData
-{
-	/*
-	 * How many times has the buffer been pinned by this backend.
-	 */
-	int32		refcount;
-
-	/*
-	 * Is the buffer locked by this backend? BUFFER_LOCK_UNLOCK indicates that
-	 * the buffer is not locked.
-	 */
-	BufferLockMode lockmode;
-} PrivateRefCountData;
-
-typedef struct PrivateRefCountEntry
-{
-	/*
-	 * Note that this needs to be same as the entry's corresponding
-	 * PrivateRefCountArrayKeys[i], if the entry is stored in the array. We
-	 * store it in both places as this is used for the hashtable key and
-	 * because it is more convenient (passing around a PrivateRefCountEntry
-	 * suffices to identify the buffer) and faster (checking the keys array is
-	 * faster when checking many entries, checking the entry is faster if just
-	 * checking a single entry).
-	 */
-	Buffer		buffer;
-
-	PrivateRefCountData data;
-} PrivateRefCountEntry;
-
-/* 64 bytes, about the size of a cache line on common systems */
-#define REFCOUNT_ARRAY_ENTRIES 8
-
 /*
  * Status of buffers to checkpoint for a particular tablespace, used
  * internally in BufferSync.
@@ -213,55 +176,6 @@ int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; /* local state for LockBufferForCleanup */ static BufferDesc *PinCountWaitBuf = NULL; -/* - * Backend-Private refcount management: - * - * Each buffer also has a private refcount that keeps track of the number of - * times the buffer is pinned in the current process. This is so that the - * shared refcount needs to be modified only once if a buffer is pinned more - * than once by an individual backend. It's also used to check that no - * buffers are still pinned at the end of transactions and when exiting. We - * also use this mechanism to track whether this backend has a buffer locked, - * and, if so, in what mode. - * - * - * To avoid - as we used to - requiring an array with NBuffers entries to keep - * track of local buffers, we use a small sequentially searched array - * (PrivateRefCountArrayKeys, with the corresponding data stored in - * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to - * keep track of backend local pins. - * - * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all - * refcounts are kept track of in the array; after that, new array entries - * displace old ones into the hash table. That way a frequently used entry - * can't get "stuck" in the hashtable while infrequent ones clog the array. - * - * Note that in most scenarios the number of pinned buffers will not exceed - * REFCOUNT_ARRAY_ENTRIES. - * - * - * To enter a buffer into the refcount tracking mechanism first reserve a free - * entry using ReservePrivateRefCountEntry() and then later, if necessary, - * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing - * memory allocations in NewPrivateRefCountEntry() which can be important - * because in some scenarios it's called with a spinlock held... - */ -static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]; -static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]; -static HTAB *PrivateRefCountHash = NULL; -static int32 PrivateRefCountOverflowed = 0; -static uint32 PrivateRefCountClock = 0; -static int ReservedRefCountSlot = -1; -static int PrivateRefCountEntryLast = -1; - -static uint32 MaxProportionalPins; - -static void ReservePrivateRefCountEntry(void); -static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); -static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); -static inline int32 GetPrivateRefCount(Buffer buffer); -static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); - /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */ static void ResOwnerReleaseBufferIO(Datum res); static char *ResOwnerPrintBufferIO(Datum res); @@ -286,301 +200,6 @@ const ResourceOwnerDesc buffer_resowner_desc = .DebugPrint = ResOwnerPrintBuffer }; -/* - * Ensure that the PrivateRefCountArray has sufficient space to store one more - * entry. This has to be called before using NewPrivateRefCountEntry() to fill - * a new entry - but it's perfectly fine to not use a reserved entry. - */ -static void -ReservePrivateRefCountEntry(void) -{ - /* Already reserved (or freed), nothing to do */ - if (ReservedRefCountSlot != -1) - return; - - /* - * First search for a free entry the array, that'll be sufficient in the - * majority of cases. 
- */ - { - int i; - - for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) - { - if (PrivateRefCountArrayKeys[i] == InvalidBuffer) - { - ReservedRefCountSlot = i; - - /* - * We could return immediately, but iterating till the end of - * the array allows compiler-autovectorization. - */ - } - } - - if (ReservedRefCountSlot != -1) - return; - } - - /* - * No luck. All array entries are full. Move one array entry into the hash - * table. - */ - { - /* - * Move entry from the current clock position in the array into the - * hashtable. Use that slot. - */ - int victim_slot; - PrivateRefCountEntry *victim_entry; - PrivateRefCountEntry *hashent; - bool found; - - /* select victim slot */ - victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES; - victim_entry = &PrivateRefCountArray[victim_slot]; - ReservedRefCountSlot = victim_slot; - - /* Better be used, otherwise we shouldn't get here. */ - Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer); - Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer); - Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer); - - /* enter victim array entry into hashtable */ - hashent = hash_search(PrivateRefCountHash, - &PrivateRefCountArrayKeys[victim_slot], - HASH_ENTER, - &found); - Assert(!found); - /* move data from the entry in the array to the hash entry */ - hashent->data = victim_entry->data; - - /* clear the now free array slot */ - PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer; - victim_entry->buffer = InvalidBuffer; - - /* clear the whole data member, just for future proofing */ - memset(&victim_entry->data, 0, sizeof(victim_entry->data)); - victim_entry->data.refcount = 0; - victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK; - - PrivateRefCountOverflowed++; - } -} - -/* - * Fill a previously reserved refcount entry. - */ -static PrivateRefCountEntry * -NewPrivateRefCountEntry(Buffer buffer) -{ - PrivateRefCountEntry *res; - - /* only allowed to be called when a reservation has been made */ - Assert(ReservedRefCountSlot != -1); - - /* use up the reserved entry */ - res = &PrivateRefCountArray[ReservedRefCountSlot]; - - /* and fill it */ - PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; - res->buffer = buffer; - res->data.refcount = 0; - res->data.lockmode = BUFFER_LOCK_UNLOCK; - - /* update cache for the next lookup */ - PrivateRefCountEntryLast = ReservedRefCountSlot; - - ReservedRefCountSlot = -1; - - return res; -} - -/* - * Slow-path for GetPrivateRefCountEntry(). This is big enough to not be worth - * inlining. This particularly seems to be true if the compiler is capable of - * auto-vectorizing the code, as that imposes additional stack-alignment - * requirements etc. - */ -static pg_noinline PrivateRefCountEntry * -GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move) -{ - PrivateRefCountEntry *res; - int match = -1; - int i; - - /* - * First search for references in the array, that'll be sufficient in the - * majority of cases. - */ - for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) - { - if (PrivateRefCountArrayKeys[i] == buffer) - { - match = i; - /* see ReservePrivateRefCountEntry() for why we don't return */ - } - } - - if (likely(match != -1)) - { - /* update cache for the next lookup */ - PrivateRefCountEntryLast = match; - - return &PrivateRefCountArray[match]; - } - - /* - * By here we know that the buffer, if already pinned, isn't residing in - * the array. - * - * Only look up the buffer in the hashtable if we've previously overflowed - * into it. 
- */ - if (PrivateRefCountOverflowed == 0) - return NULL; - - res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL); - - if (res == NULL) - return NULL; - else if (!do_move) - { - /* caller doesn't want us to move the hash entry into the array */ - return res; - } - else - { - /* move buffer from hashtable into the free array slot */ - bool found; - PrivateRefCountEntry *free; - - /* Ensure there's a free array slot */ - ReservePrivateRefCountEntry(); - - /* Use up the reserved slot */ - Assert(ReservedRefCountSlot != -1); - free = &PrivateRefCountArray[ReservedRefCountSlot]; - Assert(PrivateRefCountArrayKeys[ReservedRefCountSlot] == free->buffer); - Assert(free->buffer == InvalidBuffer); - - /* and fill it */ - free->buffer = buffer; - free->data = res->data; - PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; - /* update cache for the next lookup */ - PrivateRefCountEntryLast = match; - - ReservedRefCountSlot = -1; - - - /* delete from hashtable */ - hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); - Assert(found); - Assert(PrivateRefCountOverflowed > 0); - PrivateRefCountOverflowed--; - - return free; - } -} - -/* - * Return the PrivateRefCount entry for the passed buffer. - * - * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if - * do_move is true, and the entry resides in the hashtable the entry is - * optimized for frequent access by moving it to the array. - */ -static inline PrivateRefCountEntry * -GetPrivateRefCountEntry(Buffer buffer, bool do_move) -{ - Assert(BufferIsValid(buffer)); - Assert(!BufferIsLocal(buffer)); - - /* - * It's very common to look up the same buffer repeatedly. To make that - * fast, we have a one-entry cache. - * - * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it - * faster to check PrivateRefCountArray[].buffer, as in the case of a hit - * fewer addresses are computed and fewer cachelines are accessed. Whereas - * in GetPrivateRefCountEntrySlow()'s case, checking - * PrivateRefCountArrayKeys saves a lot of memory accesses. - */ - if (likely(PrivateRefCountEntryLast != -1) && - likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer)) - { - return &PrivateRefCountArray[PrivateRefCountEntryLast]; - } - - /* - * The code for the cached lookup is small enough to be worth inlining - * into the caller. In the miss case however, that empirically doesn't - * seem worth it. - */ - return GetPrivateRefCountEntrySlow(buffer, do_move); -} - -/* - * Returns how many times the passed buffer is pinned by this backend. - * - * Only works for shared memory buffers! - */ -static inline int32 -GetPrivateRefCount(Buffer buffer) -{ - PrivateRefCountEntry *ref; - - Assert(BufferIsValid(buffer)); - Assert(!BufferIsLocal(buffer)); - - /* - * Not moving the entry - that's ok for the current users, but we might - * want to change this one day. - */ - ref = GetPrivateRefCountEntry(buffer, false); - - if (ref == NULL) - return 0; - return ref->data.refcount; -} - -/* - * Release resources used to track the reference count of a buffer which we no - * longer have pinned and don't want to pin again immediately. 
- */ -static void -ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) -{ - Assert(ref->data.refcount == 0); - Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK); - - if (ref >= &PrivateRefCountArray[0] && - ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]) - { - ref->buffer = InvalidBuffer; - PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer; - - - /* - * Mark the just used entry as reserved - in many scenarios that - * allows us to avoid ever having to search the array/hash for free - * entries. - */ - ReservedRefCountSlot = ref - PrivateRefCountArray; - } - else - { - bool found; - Buffer buffer = ref->buffer; - - hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); - Assert(found); - Assert(PrivateRefCountOverflowed > 0); - PrivateRefCountOverflowed--; - } -} - /* * BufferIsPinned * True iff the buffer is pinned (also checks for valid buffer number). @@ -596,7 +215,7 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) BufferIsLocal(bufnum) ? \ (LocalRefCount[-(bufnum) - 1] > 0) \ : \ - (GetPrivateRefCount(bufnum) > 0) \ + (GetSharedBufferEntry(bufnum) != NULL) \ ) @@ -653,7 +272,6 @@ static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent); static void AtProcExit_Buffers(int code, Datum arg); -static void CheckForBufferLeaks(void); #ifdef USE_ASSERT_CHECKING static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode); #endif @@ -812,7 +430,6 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN Assert(BufferIsValid(recent_buffer)); ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); InitBufferTag(&tag, &rlocator, forkNum, blockNum); if (BufferIsLocal(recent_buffer)) @@ -2115,7 +1732,6 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* Make sure we will have room to remember the buffer pin */ ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); /* create a tag so we can lookup the buffer */ InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); @@ -2327,7 +1943,7 @@ retry: UnlockBufHdr(buf); LWLockRelease(oldPartitionLock); /* safety check: should definitely not be our *own* pin */ - if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0) + if (GetSharedBufferEntry(BufferDescriptorGetBuffer(buf)) != NULL) elog(ERROR, "buffer is pinned in InvalidateBuffer"); WaitIO(buf); goto retry; @@ -2380,7 +1996,7 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr) LWLock *partition_lock; BufferTag tag; - Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1); + Assert(GetSharedBufferEntry(BufferDescriptorGetBuffer(buf_hdr)) != NULL); /* have buffer pinned, so it's safe to read tag without lock */ tag = buf_hdr->tag; @@ -2461,7 +2077,6 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) * Ensure, before we pin a victim buffer, that there's a free refcount * entry and resource owner slot for the pin. */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); /* we return here if a prospective victim buffer gets used concurrently */ @@ -2591,64 +2206,6 @@ again: return buf; } -/* - * Return the maximum number of buffers that a backend should try to pin once, - * to avoid exceeding its fair share. This is the highest value that - * GetAdditionalPinLimit() could ever return. Note that it may be zero on a - * system with a very small buffer pool relative to max_connections. 
- */ -uint32 -GetPinLimit(void) -{ - return MaxProportionalPins; -} - -/* - * Return the maximum number of additional buffers that this backend should - * pin if it wants to stay under the per-backend limit, considering the number - * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit - * return by this function can be zero. - */ -uint32 -GetAdditionalPinLimit(void) -{ - uint32 estimated_pins_held; - - /* - * We get the number of "overflowed" pins for free, but don't know the - * number of pins in PrivateRefCountArray. The cost of calculating that - * exactly doesn't seem worth it, so just assume the max. - */ - estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; - - /* Is this backend already holding more than its fair share? */ - if (estimated_pins_held > MaxProportionalPins) - return 0; - - return MaxProportionalPins - estimated_pins_held; -} - -/* - * Limit the number of pins a batch operation may additionally acquire, to - * avoid running out of pinnable buffers. - * - * One additional pin is always allowed, on the assumption that the operation - * requires at least one to make progress. - */ -void -LimitAdditionalPins(uint32 *additional_pins) -{ - uint32 limit; - - if (*additional_pins <= 1) - return; - - limit = GetAdditionalPinLimit(); - limit = Max(limit, 1); - if (limit < *additional_pins) - *additional_pins = limit; -} - /* * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to * avoid duplicating the tracing and relpersistence related logic. @@ -2812,7 +2369,6 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, /* in case we need to pin an existing buffer below */ ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork, first_block + i); @@ -3185,9 +2741,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, PrivateRefCountEntry *ref; Assert(!BufferIsLocal(b)); - Assert(ReservedRefCountSlot != -1); - ref = GetPrivateRefCountEntry(b, true); + ref = GetSharedBufferEntry(b); if (ref == NULL) { @@ -3257,8 +2812,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, */ result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0; - Assert(ref->data.refcount > 0); - ref->data.refcount++; + SharedBufferRefExisting(ref); ResourceOwnerRememberBuffer(CurrentResourceOwner, b); } @@ -3296,7 +2850,7 @@ PinBuffer_Locked(BufferDesc *buf) * As explained, We don't expect any preexisting pins. 
That allows us to * manipulate the PrivateRefCount after releasing the spinlock */ - Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL); + Assert(GetSharedBufferEntry(BufferDescriptorGetBuffer(buf)) == NULL); /* * Since we hold the buffer spinlock, we can update the buffer state and @@ -3373,11 +2927,10 @@ UnpinBufferNoOwner(BufferDesc *buf) Assert(!BufferIsLocal(b)); /* not moving as we're likely deleting it soon anyway */ - ref = GetPrivateRefCountEntry(b, false); + ref = GetSharedBufferEntry(b); Assert(ref != NULL); - Assert(ref->data.refcount > 0); - ref->data.refcount--; - if (ref->data.refcount == 0) + + if (SharedBufferUnref(ref)) { uint64 old_buf_state; @@ -3402,8 +2955,6 @@ UnpinBufferNoOwner(BufferDesc *buf) /* Support LockBufferForCleanup() */ if (old_buf_state & BM_PIN_COUNT_WAITER) WakePinCountWaiter(buf); - - ForgetPrivateRefCountEntry(ref); } } @@ -3414,10 +2965,7 @@ UnpinBufferNoOwner(BufferDesc *buf) inline void TrackNewBufferPin(Buffer buf) { - PrivateRefCountEntry *ref; - - ref = NewPrivateRefCountEntry(buf); - ref->data.refcount++; + SharedBufferCreateRef(buf); ResourceOwnerRememberBuffer(CurrentResourceOwner, buf); @@ -4037,7 +3585,6 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) BufferTag tag; /* Make sure we can handle the pin */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); /* @@ -4101,11 +3648,9 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) void AtEOXact_Buffers(bool isCommit) { - CheckForBufferLeaks(); + CheckPrivateRefCountLeaks(); AtEOXact_LocalBuffers(isCommit); - - Assert(PrivateRefCountOverflowed == 0); } /* @@ -4118,25 +3663,8 @@ AtEOXact_Buffers(bool isCommit) void InitBufferManagerAccess(void) { - HASHCTL hash_ctl; - - /* - * An advisory limit on the number of pins each backend should hold, based - * on shared_buffers and the maximum number of connections possible. - * That's very pessimistic, but outside toy-sized shared_buffers it should - * allow plenty of pins. LimitAdditionalPins() and - * GetAdditionalPinLimit() can be used to check the remaining balance. - */ - MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS); - - memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); - memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys)); - - hash_ctl.keysize = sizeof(Buffer); - hash_ctl.entrysize = sizeof(PrivateRefCountEntry); - PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl, - HASH_ELEM | HASH_BLOBS); + InitPrivateRefCount(); /* * AtProcExit_Buffers needs LWLock access, and thereby has to be called at @@ -4155,112 +3683,13 @@ AtProcExit_Buffers(int code, Datum arg) { UnlockBuffers(); - CheckForBufferLeaks(); + CheckPrivateRefCountLeaks(); /* localbuf.c needs a chance too */ AtProcExit_LocalBuffers(); } -/* - * CheckForBufferLeaks - ensure this backend holds no buffer pins - * - * As of PostgreSQL 8.0, buffer pins should get released by the - * ResourceOwner mechanism. This routine is just a debugging - * cross-check that no pins remain. 
- */ -static void -CheckForBufferLeaks(void) -{ #ifdef USE_ASSERT_CHECKING - int RefCountErrors = 0; - PrivateRefCountEntry *res; - int i; - char *s; - - /* check the array */ - for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) - { - if (PrivateRefCountArrayKeys[i] != InvalidBuffer) - { - res = &PrivateRefCountArray[i]; - - s = DebugPrintBufferRefcount(res->buffer); - elog(WARNING, "buffer refcount leak: %s", s); - pfree(s); - - RefCountErrors++; - } - } - - /* if necessary search the hash */ - if (PrivateRefCountOverflowed) - { - HASH_SEQ_STATUS hstat; - - hash_seq_init(&hstat, PrivateRefCountHash); - while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL) - { - s = DebugPrintBufferRefcount(res->buffer); - elog(WARNING, "buffer refcount leak: %s", s); - pfree(s); - RefCountErrors++; - } - } - - Assert(RefCountErrors == 0); -#endif -} - -#ifdef USE_ASSERT_CHECKING -/* - * Check for exclusive-locked catalog buffers. This is the core of - * AssertCouldGetRelation(). - * - * A backend would self-deadlock on the content lock if the catalog scan read - * the exclusive-locked buffer. The main threat is exclusive-locked buffers - * of catalogs used in relcache, because a catcache search on any catalog may - * build that catalog's relcache entry. We don't have an inventory of - * catalogs relcache uses, so just check buffers of most catalogs. - * - * It's better to minimize waits while holding an exclusive buffer lock, so it - * would be nice to broaden this check not to be catalog-specific. However, - * bttextcmp() accesses pg_collation, and non-core opclasses might similarly - * read tables. That is deadlock-free as long as there's no loop in the - * dependency graph: modifying table A may cause an opclass to read table B, - * but it must not cause a read of table A. - */ -void -AssertBufferLocksPermitCatalogRead(void) -{ - PrivateRefCountEntry *res; - - /* check the array */ - for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) - { - if (PrivateRefCountArrayKeys[i] != InvalidBuffer) - { - res = &PrivateRefCountArray[i]; - - if (res->buffer == InvalidBuffer) - continue; - - AssertNotCatalogBufferLock(res->buffer, res->data.lockmode); - } - } - - /* if necessary search the hash */ - if (PrivateRefCountOverflowed) - { - HASH_SEQ_STATUS hstat; - - hash_seq_init(&hstat, PrivateRefCountHash); - while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL) - { - AssertNotCatalogBufferLock(res->buffer, res->data.lockmode); - } - } -} - static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode) { @@ -4312,8 +3741,10 @@ DebugPrintBufferRefcount(Buffer buffer) } else { + PrivateRefCountEntry *ref = GetSharedBufferEntry(buffer); + buf = GetBufferDescriptor(buffer - 1); - loccount = GetPrivateRefCount(buffer); + loccount = ref ? 
SharedBufferRefCount(ref) : 0; backend = INVALID_PROC_NUMBER; } @@ -5100,7 +4531,6 @@ FlushRelationBuffers(Relation rel) error_context_stack = &errcallback; /* Make sure we can handle the pin */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); /* @@ -5136,7 +4566,6 @@ FlushRelationBuffers(Relation rel) continue; /* Make sure we can handle the pin */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); buf_state = LockBufHdr(bufHdr); @@ -5231,7 +4660,6 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) continue; /* Make sure we can handle the pin */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); buf_state = LockBufHdr(bufHdr); @@ -5457,7 +4885,6 @@ FlushDatabaseBuffers(Oid dbid) continue; /* Make sure we can handle the pin */ - ReservePrivateRefCountEntry(); ResourceOwnerEnlarge(CurrentResourceOwner); buf_state = LockBufHdr(bufHdr); @@ -5532,17 +4959,18 @@ UnlockReleaseBuffer(Buffer buffer) void IncrBufferRefCount(Buffer buffer) { - Assert(BufferIsPinned(buffer)); ResourceOwnerEnlarge(CurrentResourceOwner); if (BufferIsLocal(buffer)) + { + Assert(LocalRefCount[-buffer - 1] > 0); LocalRefCount[-buffer - 1]++; + } else { - PrivateRefCountEntry *ref; + PrivateRefCountEntry *ref = GetSharedBufferEntry(buffer); - ref = GetPrivateRefCountEntry(buffer, true); Assert(ref != NULL); - ref->data.refcount++; + SharedBufferRefExisting(ref); } ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer); } @@ -5561,11 +4989,9 @@ MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, { Page page = BufferGetPage(buffer); - Assert(GetPrivateRefCount(buffer) > 0); - + Assert(GetSharedBufferEntry(buffer) != NULL); /* here, either share-exclusive or exclusive lock is OK */ - Assert(BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_EXCLUSIVE) || - BufferLockHeldByMeInMode(bufHdr, BUFFER_LOCK_SHARE_EXCLUSIVE)); + Assert(BufferIsLockedByMe(buffer)); /* * This routine might get called many times on the same page, if we are @@ -5777,12 +5203,12 @@ BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) * Get reference to the refcount entry before we hold the lock, it seems * better to do before holding the lock. */ - entry = GetPrivateRefCountEntry(buffer, true); + entry = GetSharedBufferEntry(buffer); /* * We better not already hold a lock on the buffer. */ - Assert(entry->data.lockmode == BUFFER_LOCK_UNLOCK); + Assert(SharedBufferGetLockMode(entry) == BUFFER_LOCK_UNLOCK); /* * Lock out cancel/die interrupts until we exit the code section protected @@ -5871,7 +5297,7 @@ BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) } /* Remember that we now hold this lock */ - entry->data.lockmode = mode; + SharedBufferSetLockMode(entry, mode); /* * Fix the process wait semaphore's count for any absorbed wakeups. @@ -5922,7 +5348,7 @@ BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr) static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) { - PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true); + PrivateRefCountEntry *entry = GetSharedBufferEntry(buffer); bool mustwait; /* @@ -5930,7 +5356,7 @@ BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) * already has locked, return false, independent of the existing and * desired lock level. 
*/ - if (entry->data.lockmode != BUFFER_LOCK_UNLOCK) + if (SharedBufferGetLockMode(entry) != BUFFER_LOCK_UNLOCK) return false; /* @@ -5950,7 +5376,7 @@ BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode) } else { - entry->data.lockmode = mode; + SharedBufferSetLockMode(entry, mode); } return !mustwait; @@ -6160,11 +5586,11 @@ BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr) BufferLockMode mode; PrivateRefCountEntry *ref; - ref = GetPrivateRefCountEntry(buffer, false); + ref = GetSharedBufferEntry(buffer); if (ref == NULL) elog(ERROR, "lock %d is not held", buffer); - mode = ref->data.lockmode; - ref->data.lockmode = BUFFER_LOCK_UNLOCK; + mode = SharedBufferGetLockMode(ref); + SharedBufferSetLockMode(ref, BUFFER_LOCK_UNLOCK); return mode; } @@ -6398,12 +5824,12 @@ static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode) { PrivateRefCountEntry *entry = - GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false); + GetSharedBufferEntry(BufferDescriptorGetBuffer(buf_hdr)); if (!entry) return false; else - return entry->data.lockmode == mode; + return SharedBufferGetLockMode(entry) == mode; } /* @@ -6416,12 +5842,12 @@ static bool BufferLockHeldByMe(BufferDesc *buf_hdr) { PrivateRefCountEntry *entry = - GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf_hdr), false); + GetSharedBufferEntry(BufferDescriptorGetBuffer(buf_hdr)); if (!entry) return false; else - return entry->data.lockmode != BUFFER_LOCK_UNLOCK; + return SharedBufferGetLockMode(entry) != BUFFER_LOCK_UNLOCK; } /* @@ -6517,9 +5943,13 @@ CheckBufferIsPinnedOnce(Buffer buffer) } else { - if (GetPrivateRefCount(buffer) != 1) - elog(ERROR, "incorrect local pin count: %d", - GetPrivateRefCount(buffer)); + { + PrivateRefCountEntry *ref = GetSharedBufferEntry(buffer); + + if (!ref || !SharedBufferHasSingleRef(ref)) + elog(ERROR, "incorrect pin count: %d", + ref ? 
SharedBufferRefCount(ref) : 0); + } } } @@ -6700,7 +6130,7 @@ HoldingBufferPinThatDelaysRecovery(void) if (bufid < 0) return false; - if (GetPrivateRefCount(bufid + 1) > 0) + if (GetSharedBufferEntry(bufid + 1) != NULL) return true; return false; @@ -6735,10 +6165,13 @@ ConditionalLockBufferForCleanup(Buffer buffer) } /* There should be exactly one local pin */ - refcount = GetPrivateRefCount(buffer); - Assert(refcount); - if (refcount != 1) - return false; + { + PrivateRefCountEntry *ref = GetSharedBufferEntry(buffer); + + Assert(ref != NULL); + if (!SharedBufferHasSingleRef(ref)) + return false; + } /* Try to acquire lock */ if (!ConditionalLockBuffer(buffer)) @@ -6790,8 +6223,12 @@ IsBufferCleanupOK(Buffer buffer) } /* There should be exactly one local pin */ - if (GetPrivateRefCount(buffer) != 1) - return false; + { + PrivateRefCountEntry *ref = GetSharedBufferEntry(buffer); + + if (!SharedBufferHasSingleRef(ref)) + return false; + } bufHdr = GetBufferDescriptor(buffer - 1); @@ -6827,12 +6264,12 @@ SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *locksta PrivateRefCountEntry *ref; BufferLockMode mode; - ref = GetPrivateRefCountEntry(buffer, true); + ref = GetSharedBufferEntry(buffer); if (ref == NULL) elog(ERROR, "buffer is not pinned"); - mode = ref->data.lockmode; + mode = SharedBufferGetLockMode(ref); if (mode == BUFFER_LOCK_UNLOCK) elog(ERROR, "buffer is not locked"); @@ -6874,7 +6311,7 @@ SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *locksta if (likely(pg_atomic_compare_exchange_u64(&buf_hdr->state, &old_state, desired_state))) { - ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE; + SharedBufferSetLockMode(ref, BUFFER_LOCK_SHARE_EXCLUSIVE); *lockstate = desired_state; return true; @@ -7647,7 +7084,7 @@ ResOwnerReleaseBuffer(Datum res) { PrivateRefCountEntry *ref; - ref = GetPrivateRefCountEntry(buffer, false); + ref = GetSharedBufferEntry(buffer); /* not having a private refcount would imply resowner corruption */ Assert(ref != NULL); @@ -7656,7 +7093,7 @@ ResOwnerReleaseBuffer(Datum res) * If the buffer was locked at the time of the resowner release, * release the lock now. This should only happen after errors. */ - if (ref->data.lockmode != BUFFER_LOCK_UNLOCK) + if (SharedBufferGetLockMode(ref) != BUFFER_LOCK_UNLOCK) { BufferDesc *buf = GetBufferDescriptor(buffer - 1); @@ -7749,7 +7186,6 @@ EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed) /* Make sure we can pin the buffer. */ ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); desc = GetBufferDescriptor(buf - 1); LockBufHdr(desc); @@ -7790,7 +7226,6 @@ EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, continue; ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); LockBufHdr(desc); @@ -7844,7 +7279,6 @@ EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, /* Make sure we can pin the buffer. */ ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); buf_state = LockBufHdr(desc); @@ -7936,7 +7370,6 @@ MarkDirtyUnpinnedBuffer(Buffer buf, bool *buffer_already_dirty) /* Make sure we can pin the buffer. */ ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); desc = GetBufferDescriptor(buf - 1); LockBufHdr(desc); @@ -7989,7 +7422,6 @@ MarkDirtyRelUnpinnedBuffers(Relation rel, /* Make sure we can pin the buffer. 
*/
 		ResourceOwnerEnlarge(CurrentResourceOwner);
-		ReservePrivateRefCountEntry();
 
 		buf_state = LockBufHdr(desc);
 
@@ -8041,7 +7473,6 @@ MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
 			continue;
 
 		ResourceOwnerEnlarge(CurrentResourceOwner);
-		ReservePrivateRefCountEntry();
 
 		LockBufHdr(desc);
 
@@ -8740,3 +8171,12 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
 	.complete_local = local_buffer_readv_complete,
 	.report = buffer_readv_report,
 };
+
+
+/*
+ * Intentionally included at the bottom of the file: this lets the compiler
+ * inline these functions while still forcing developers to go through the
+ * accessors. bufmgr.c should not be concerned with bufmgr_refcnt.c's
+ * implementation details.
+ */
+#include "bufmgr_refcnt.c"
diff --git a/src/backend/storage/buffer/bufmgr_refcnt.c b/src/backend/storage/buffer/bufmgr_refcnt.c
new file mode 100644
index 0000000000..fef8f98037
--- /dev/null
+++ b/src/backend/storage/buffer/bufmgr_refcnt.c
@@ -0,0 +1,615 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr_refcnt.c
+ *	  Backend-private buffer refcount tracking
+ *
+ * This file is included at the end of bufmgr.c to allow the compiler
+ * to inline functions. Do not compile this file separately.
+ *
+ * Each buffer has a private refcount that keeps track of the number of
+ * times the buffer is pinned in the current process. This is so that the
+ * shared refcount needs to be modified only once if a buffer is pinned more
+ * than once by an individual backend. This mechanism is also used to track
+ * whether this backend has a buffer locked, and, if so, in what mode.
+ *
+ * To avoid - as we used to - requiring an array with NBuffers entries to keep
+ * track of local buffers, we use a small sequentially searched array
+ * (PrivateRefCountArrayKeys, with the corresponding data stored in
+ * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
+ * keep track of backend local pins.
+ *
+ * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
+ * refcounts are kept track of in the array; after that, new array entries
+ * displace old ones into the hash table. That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * This was initially designed to optimize for the case where the number of
+ * pinned buffers does not exceed REFCOUNT_ARRAY_ENTRIES. However, that
+ * assumption might no longer hold with the introduction of prefetching.
+ *
+ * To enter a buffer into the refcount tracking mechanism first reserve a free
+ * entry using ReservePrivateRefCountEntry() and then later, if necessary,
+ * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
+ * memory allocations in NewPrivateRefCountEntry() which can be important
+ * because in some scenarios it's called with a spinlock held...
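+ *
+ * A typical pin of a not-yet-tracked buffer thus follows this sketch
+ * (SharedBufferCreateRef() below wraps exactly this sequence):
+ *
+ *		ReservePrivateRefCountEntry();
+ *		... pin the buffer, possibly while holding its header spinlock ...
+ *		ref = NewPrivateRefCountEntry(buffer);
+ *		ref->data.refcount++;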
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/buffer/bufmgr_refcnt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "utils/hsearch.h"
+
+/* Structure definitions - internal to this file */
+typedef struct PrivateRefCountData
+{
+	int32		refcount;
+	BufferLockMode lockmode;
+} PrivateRefCountData;
+
+struct PrivateRefCountEntry
+{
+	Buffer		buffer;
+	PrivateRefCountData data;
+};
+
+struct PrivateRefCountIterator
+{
+	int			array_index;
+	bool		in_hash;
+	HASH_SEQ_STATUS *hash_status;
+};
+
+/* Private refcount array and keys */
+#define REFCOUNT_ARRAY_ENTRIES 8
+static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+
+/* Overflow hash table for when the array is full */
+static HTAB *PrivateRefCountHash = NULL;
+
+/* Count of entries that have overflowed into the hash table */
+static int32 PrivateRefCountOverflowed = 0;
+
+/* Clock hand for selecting a victim when the array is full */
+static uint32 PrivateRefCountClock = 0;
+
+/* Reserved slot index, or -1 if none reserved */
+static int	ReservedRefCountSlot = -1;
+
+/* Cache for the last accessed entry */
+static int	PrivateRefCountEntryLast = -1;
+
+/* Advisory limit on the number of pins each backend should hold */
+static uint32 MaxProportionalPins = 0;
+
+/* Forward declarations */
+static void ReservePrivateRefCountEntry(void);
+static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
+static pg_noinline PrivateRefCountEntry *GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move);
+
+/*
+ * Initialize private refcount tracking for this backend.
+ */
+void
+InitPrivateRefCount(void)
+{
+	HASHCTL		hash_ctl;
+
+	/*
+	 * An advisory limit on the number of pins each backend should hold, based
+	 * on shared_buffers and the maximum number of connections possible.
+	 * That's very pessimistic, but outside toy-sized shared_buffers it should
+	 * allow plenty of pins. LimitAdditionalPins() and
+	 * GetAdditionalPinLimit() can be used to check the remaining balance.
+	 */
+	MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
+
+	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+	memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
+
+	hash_ctl.keysize = sizeof(Buffer);
+	hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+
+	PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+									  HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * Ensure that the PrivateRefCountArray has sufficient space to store one more
+ * entry.
+ */
+static void
+ReservePrivateRefCountEntry(void)
+{
+	/* Already reserved (or freed), nothing to do */
+	if (ReservedRefCountSlot != -1)
+		return;
+
+	/*
+	 * First search for a free entry in the array; that'll be sufficient in
+	 * the majority of cases.
+	 */
+	{
+		int			i;
+
+		for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+		{
+			if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
+			{
+				ReservedRefCountSlot = i;
+
+				/*
+				 * We could return immediately, but iterating till the end of
+				 * the array allows compiler-autovectorization.
+				 */
+			}
+		}
+
+		if (ReservedRefCountSlot != -1)
+			return;
+	}
+
+	/*
+	 * No luck. All array entries are full. Move one array entry into the hash
+	 * table.
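+	 * For example, once all REFCOUNT_ARRAY_ENTRIES slots are occupied, the
+	 * slot under the PrivateRefCountClock hand is pushed out into
+	 * PrivateRefCountHash below and handed back as the reserved slot.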
+ */ + { + int victim_slot; + PrivateRefCountEntry *victim_entry; + PrivateRefCountEntry *hashent; + bool found; + + /* select victim slot */ + victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES; + victim_entry = &PrivateRefCountArray[victim_slot]; + ReservedRefCountSlot = victim_slot; + + /* Better be used, otherwise we shouldn't get here. */ + Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer); + Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer); + Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer); + + /* enter victim array entry into hashtable */ + hashent = hash_search(PrivateRefCountHash, + &PrivateRefCountArrayKeys[victim_slot], + HASH_ENTER, + &found); + Assert(!found); + hashent->data = victim_entry->data; + + /* clear the now free array slot */ + PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer; + victim_entry->buffer = InvalidBuffer; + + memset(&victim_entry->data, 0, sizeof(victim_entry->data)); + victim_entry->data.refcount = 0; + victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK; + + PrivateRefCountOverflowed++; + } +} + +/* + * Create a new refcount entry for the given buffer. + */ +static PrivateRefCountEntry * +NewPrivateRefCountEntry(Buffer buffer) +{ + PrivateRefCountEntry *res; + + /* only allowed to be called when a reservation has been made */ + Assert(ReservedRefCountSlot != -1); + + /* use up the reserved entry */ + res = &PrivateRefCountArray[ReservedRefCountSlot]; + + /* and fill it */ + PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; + res->buffer = buffer; + res->data.refcount = 0; + res->data.lockmode = BUFFER_LOCK_UNLOCK; + + /* update cache for the next lookup */ + PrivateRefCountEntryLast = ReservedRefCountSlot; + + ReservedRefCountSlot = -1; + + return res; +} + +/* + * Slow-path for GetSharedBufferEntry(). + */ +static pg_noinline PrivateRefCountEntry * +GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move) +{ + PrivateRefCountEntry *res; + int match = -1; + int i; + + /* + * First search for references in the array. + */ + for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + if (PrivateRefCountArrayKeys[i] == buffer) + { + match = i; + } + } + + if (likely(match != -1)) + { + PrivateRefCountEntryLast = match; + return &PrivateRefCountArray[match]; + } + + /* + * Only look up the buffer in the hashtable if we've previously overflowed. + */ + if (PrivateRefCountOverflowed == 0) + return NULL; + + res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL); + + if (res == NULL) + return NULL; + else if (!do_move) + { + return res; + } + else + { + /* move buffer from hashtable into the free array slot */ + bool found; + PrivateRefCountEntry *free; + + ReservePrivateRefCountEntry(); + + Assert(ReservedRefCountSlot != -1); + free = &PrivateRefCountArray[ReservedRefCountSlot]; + Assert(free->buffer == InvalidBuffer); + + free->buffer = buffer; + free->data = res->data; + PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer; + PrivateRefCountEntryLast = ReservedRefCountSlot; + + ReservedRefCountSlot = -1; + + hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); + Assert(found); + Assert(PrivateRefCountOverflowed > 0); + PrivateRefCountOverflowed--; + + return free; + } +} + +/* + * Return the PrivateRefCountEntry for the passed buffer. + * Returns NULL if the buffer is not currently pinned. 
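+ *
+ * Repeated lookups of the same buffer are served from a one-entry cache
+ * before falling back to GetPrivateRefCountEntrySlow(). A typical caller
+ * follows this sketch:
+ *
+ *		ref = GetSharedBufferEntry(buffer);
+ *		if (ref == NULL)
+ *			elog(ERROR, "buffer is not pinned");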
+ */ +static PrivateRefCountEntry * +GetSharedBufferEntry(Buffer buffer) +{ + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); + + /* Fast path: check one-entry cache */ + if (likely(PrivateRefCountEntryLast != -1) && + likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer)) + { + return &PrivateRefCountArray[PrivateRefCountEntryLast]; + } + + return GetPrivateRefCountEntrySlow(buffer, false); +} + +/* + * Create a new refcount entry for a buffer that is known to not be pinned. + * This is a fast path that skips the cache/hash lookup. + * Returns the new entry pointer with refcount already incremented. + */ +static PrivateRefCountEntry * +SharedBufferCreateRef(Buffer buffer) +{ + PrivateRefCountEntry *ref; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); + + ReservePrivateRefCountEntry(); + ref = NewPrivateRefCountEntry(buffer); + ref->data.refcount++; + + return ref; +} + +/* + * Increment the private refcount for an existing entry. + * Use when you already have the entry from a previous lookup. + */ +static void +SharedBufferRefExisting(PrivateRefCountEntry *ref) +{ + Assert(ref != NULL); + Assert(ref->data.refcount > 0); + ref->data.refcount++; +} + +/* + * Decrement the private refcount for a buffer. + * If the refcount reaches zero, removes the entry and returns true. + * Returns false if the buffer still has references. + */ +static bool +SharedBufferUnref(PrivateRefCountEntry *ref) +{ + Assert(ref != NULL); + Assert(ref->data.refcount > 0); + + ref->data.refcount--; + + if (ref->data.refcount == 0) + { + /* No more references - clean up the entry */ + Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK); + + if (ref >= &PrivateRefCountArray[0] && + ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]) + { + ref->buffer = InvalidBuffer; + PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer; + ReservedRefCountSlot = ref - PrivateRefCountArray; + } + else + { + bool found; + Buffer buffer = ref->buffer; + + hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); + Assert(found); + Assert(PrivateRefCountOverflowed > 0); + PrivateRefCountOverflowed--; + } + + return true; + } + + return false; +} + +/* + * Accessors for refcount entry fields. + */ +static int32 +SharedBufferRefCount(PrivateRefCountEntry *ref) +{ + return ref->data.refcount; +} + +static BufferLockMode +SharedBufferGetLockMode(PrivateRefCountEntry *ref) +{ + return ref->data.lockmode; +} + +static void +SharedBufferSetLockMode(PrivateRefCountEntry *ref, BufferLockMode mode) +{ + ref->data.lockmode = mode; +} + +/* + * Check if the entry has exactly one reference. + */ +static bool +SharedBufferHasSingleRef(PrivateRefCountEntry *ref) +{ + return ref != NULL && ref->data.refcount == 1; +} + +/* + * Check for buffer refcount leaks. 
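+ *
+ * Called at end of transaction and at backend exit. In assert-enabled
+ * builds, any entry still present is reported with a WARNING and then
+ * trips the final assertion; without USE_ASSERT_CHECKING this is a no-op.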
+ */ +void +CheckPrivateRefCountLeaks(void) +{ +#ifdef USE_ASSERT_CHECKING + int RefCountErrors = 0; + PrivateRefCountEntry *res; + int i; + char *s; + + /* check the array */ + for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + if (PrivateRefCountArrayKeys[i] != InvalidBuffer) + { + res = &PrivateRefCountArray[i]; + + s = DebugPrintBufferRefcount(res->buffer); + elog(WARNING, "buffer refcount leak: %s", s); + pfree(s); + + RefCountErrors++; + } + } + + /* if necessary search the hash */ + if (PrivateRefCountOverflowed) + { + HASH_SEQ_STATUS hstat; + + hash_seq_init(&hstat, PrivateRefCountHash); + while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL) + { + s = DebugPrintBufferRefcount(res->buffer); + elog(WARNING, "buffer refcount leak: %s", s); + pfree(s); + RefCountErrors++; + } + } + + Assert(RefCountErrors == 0); +#endif +} + +#ifdef USE_ASSERT_CHECKING +/* Forward declaration - defined in bufmgr.c */ +static void AssertNotCatalogBufferLock(Buffer buffer, BufferLockMode mode); + +/* + * Check for exclusive-locked catalog buffers. This is the core of + * AssertCouldGetRelation(). + */ +void +AssertBufferLocksPermitCatalogRead(void) +{ + PrivateRefCountIterator *iter; + PrivateRefCountEntry *res; + + iter = InitPrivateRefCountIterator(); + while ((res = GetNextPrivateRefCountEntry(iter)) != NULL) + { + Buffer buf = res->buffer; + + if (buf == InvalidBuffer) + continue; + + AssertNotCatalogBufferLock(buf, res->data.lockmode); + } + FreePrivateRefCountIterator(iter); +} +#endif + +/* + * Initialize an iterator for walking all private refcount entries. + */ +PrivateRefCountIterator * +InitPrivateRefCountIterator(void) +{ + PrivateRefCountIterator *iter = palloc(sizeof(PrivateRefCountIterator)); + + iter->array_index = 0; + iter->in_hash = false; + iter->hash_status = NULL; + return iter; +} + +/* + * Get the next private refcount entry. + * Returns NULL when iteration is complete. + */ +PrivateRefCountEntry * +GetNextPrivateRefCountEntry(PrivateRefCountIterator *iter) +{ + /* First iterate through the array */ + while (!iter->in_hash && iter->array_index < REFCOUNT_ARRAY_ENTRIES) + { + int idx = iter->array_index++; + + if (PrivateRefCountArrayKeys[idx] != InvalidBuffer) + return &PrivateRefCountArray[idx]; + } + + /* Then iterate through the hash if there are overflowed entries */ + if (!iter->in_hash) + { + iter->in_hash = true; + if (PrivateRefCountOverflowed > 0) + { + iter->hash_status = palloc(sizeof(HASH_SEQ_STATUS)); + hash_seq_init(iter->hash_status, PrivateRefCountHash); + } + } + + if (iter->hash_status != NULL) + { + PrivateRefCountEntry *res; + + res = (PrivateRefCountEntry *) hash_seq_search(iter->hash_status); + if (res != NULL) + return res; + + pfree(iter->hash_status); + iter->hash_status = NULL; + } + + return NULL; +} + +/* + * Free an iterator from InitPrivateRefCountIterator. + */ +void +FreePrivateRefCountIterator(PrivateRefCountIterator *iter) +{ + if (iter->hash_status != NULL) + { + hash_seq_term(iter->hash_status); + pfree(iter->hash_status); + } + pfree(iter); +} + + +/* + * Return the maximum number of buffers that a backend should try to pin once, + * to avoid exceeding its fair share. This is the highest value that + * GetAdditionalPinLimit() could ever return. Note that it may be zero on a + * system with a very small buffer pool relative to max_connections. 
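+ *
+ * For example (illustrative numbers only): with shared_buffers = 16MB there
+ * are 2048 buffers, so with MaxBackends + NUM_AUXILIARY_PROCS adding up to
+ * 128, the advisory limit works out to 2048 / 128 = 16 pins per backend.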
+ */
+uint32
+GetPinLimit(void)
+{
+	return MaxProportionalPins;
+}
+
+/*
+ * Return the maximum number of additional buffers that this backend should
+ * pin if it wants to stay under the per-backend limit, considering the number
+ * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit
+ * returned by this function can be zero.
+ */
+uint32
+GetAdditionalPinLimit(void)
+{
+	uint32		estimated_pins_held;
+
+	/*
+	 * We get the number of "overflowed" pins for free, but don't know the
+	 * number of pins in PrivateRefCountArray. The cost of calculating that
+	 * exactly doesn't seem worth it, so just assume the max.
+	 */
+	estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
+
+	/* Is this backend already holding more than its fair share? */
+	if (estimated_pins_held > MaxProportionalPins)
+		return 0;
+
+	return MaxProportionalPins - estimated_pins_held;
+}
+
+/*
+ * Limit the number of pins a batch operation may additionally acquire, to
+ * avoid running out of pinnable buffers.
+ *
+ * One additional pin is always allowed, on the assumption that the operation
+ * requires at least one to make progress.
+ */
+void
+LimitAdditionalPins(uint32 *additional_pins)
+{
+	uint32		limit;
+
+	if (*additional_pins <= 1)
+		return;
+
+	limit = GetAdditionalPinLimit();
+	limit = Max(limit, 1);
+	if (limit < *additional_pins)
+		*additional_pins = limit;
+}
diff --git a/src/backend/storage/buffer/bufmgr_refcnt.h b/src/backend/storage/buffer/bufmgr_refcnt.h
new file mode 100644
index 0000000000..f2a6f45d84
--- /dev/null
+++ b/src/backend/storage/buffer/bufmgr_refcnt.h
@@ -0,0 +1,59 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr_refcnt.h
+ *	  Backend-private buffer refcount tracking
+ *
+ * This header provides opaque declarations for the private refcount
+ * tracking system. The implementation is in bufmgr_refcnt.c, which is
+ * included at the end of bufmgr.c for inlining.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/buffer/bufmgr_refcnt.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef BUFMGR_REFCNT_H
+#define BUFMGR_REFCNT_H
+
+#include "storage/buf.h"
+#include "storage/bufmgr.h"
+
+/* Opaque handle to a private refcount entry */
+typedef struct PrivateRefCountEntry PrivateRefCountEntry;
+
+/* Opaque handle to an iterator */
+typedef struct PrivateRefCountIterator PrivateRefCountIterator;
+
+/* Initialization */
+extern void InitPrivateRefCount(void);
+
+/*
+ * Hot-path functions - forward declarations.
+ * Defined as static in bufmgr_refcnt.c, which is included in bufmgr.c.
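+ *
+ * Note that this makes the header private to bufmgr.c: any other
+ * translation unit that included it would see static declarations that
+ * are never defined, which the compiler will warn about.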
+ */ +static PrivateRefCountEntry *GetSharedBufferEntry(Buffer buffer); +static PrivateRefCountEntry *SharedBufferCreateRef(Buffer buffer); +static void SharedBufferRefExisting(PrivateRefCountEntry *ref); +static bool SharedBufferUnref(PrivateRefCountEntry *ref); +static BufferLockMode SharedBufferGetLockMode(PrivateRefCountEntry *ref); +static void SharedBufferSetLockMode(PrivateRefCountEntry *ref, BufferLockMode mode); +static int32 SharedBufferRefCount(PrivateRefCountEntry *ref); +static bool SharedBufferHasSingleRef(PrivateRefCountEntry *ref); + +/* Pin limiting */ +extern uint32 GetPinLimit(void); +extern uint32 GetAdditionalPinLimit(void); +extern void LimitAdditionalPins(uint32 *additional_pins); + +/* Leak checking */ +extern void CheckPrivateRefCountLeaks(void); + +/* Iterator functions */ +extern PrivateRefCountIterator *InitPrivateRefCountIterator(void); +extern PrivateRefCountEntry *GetNextPrivateRefCountEntry(PrivateRefCountIterator *iter); +extern void FreePrivateRefCountIterator(PrivateRefCountIterator *iter); + + +#endif /* BUFMGR_REFCNT_H */ -- 2.34.1