From 33138ffb0d834bc452e9d2d1819f21871c64210f Mon Sep 17 00:00:00 2001
From: bob
Date: Wed, 11 Mar 2026 12:08:04 +0000
Subject: [PATCH 5/5] Array direct mapping

Instead of searching all array entries, each buffer is mapped to
exactly one slot at index (buffer & mask). Collisions are resolved by
evicting the existing entry to the hash table, and the array doubles
in size when its occupation exceeds a threshold.

This makes the array check O(1) and should improve performance when
more than 8 buffers are pinned at once, while maintaining performance
for small numbers of pinned buffers.
---
 src/backend/storage/buffer/bufmgr_refcnt.c | 307 +++++++++------------
 1 file changed, 132 insertions(+), 175 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr_refcnt.c b/src/backend/storage/buffer/bufmgr_refcnt.c
index bf84e9f62e..726697ebf7 100644
--- a/src/backend/storage/buffer/bufmgr_refcnt.c
+++ b/src/backend/storage/buffer/bufmgr_refcnt.c
@@ -12,26 +12,15 @@
  * than once by an individual backend. This mechanism is also used to track
  * whether this backend has a buffer locked, and, if so, in what mode.
  *
- * To avoid - as we used to - requiring an array with NBuffers entries to keep
- * track of local buffers, we use a small sequentially searched array
- * (PrivateRefCountArrayKeys, with the corresponding data stored in
- * PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
- * keep track of backend local pins.
+ * To avoid requiring an array with NBuffers entries, we use a dynamically
+ * sized direct-mapped array (PrivateRefCountArray) and an overflow hash table
+ * (PrivateRefCountHash). Each buffer maps to exactly one array slot at
+ * index (buffer & mask). When a collision occurs (two different buffers
+ * map to the same slot), the existing entry is evicted to the hash table.
+ * The array grows automatically when occupation exceeds a specified threshold.
  *
- * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
- * refcounts are kept track of in the array; after that, new array entries
- * displace old ones into the hash table. That way a frequently used entry
- * can't get "stuck" in the hashtable while infrequent ones clog the array.
- *
- * This was initially designed trying to optimize for the case where the
- * number of pinned buffers is expected to not exceed REFCOUNT_ARRAY_ENTRIES.
- * However this might not be the case with the introduction of prefetching.
- *
- * To enter a buffer into the refcount tracking mechanism first reserve a free
- * entry using ReservePrivateRefCountEntry() and then later, if necessary,
- * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
- * memory allocations in NewPrivateRefCountEntry() which can be important
- * because in some scenarios it's called with a spinlock held...
+ * This design provides O(1) lookup for the common case where there are no
+ * collisions, while gracefully handling overflow via the hash table.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
@@ -86,33 +77,40 @@ struct PrivateRefCountIterator
 #define SH_DEFINE
 #include "lib/simplehash.h"

-/* Private refcount array and keys */
-#define REFCOUNT_ARRAY_ENTRIES 8
-static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES];
-static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+/*
+ * Private refcount array - direct-mapped by (buffer & mask).
+ * Each buffer maps to exactly one slot. Collisions evict to the hash table.
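+ *
+ * For example, with the initial 8-slot array (mask = 7), buffers 5 and 13
+ * both map to slot 5 (5 & 7 == 13 & 7 == 5); pinning buffer 13 while
+ * buffer 5 is pinned evicts buffer 5's entry into the hash table.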
+ *
+ * Consider an array with N slots, after storing n random buffers:
+ *
+ * - The probability of a single buffer hitting a given slot is 1/N.
+ * - The probability of a single buffer not hitting a given slot is 1 - 1/N.
+ * - The probability that none of n buffers hits a given slot is (1 - 1/N)^n.
+ * - The expected number of vacant slots is N * (1 - 1/N)^n ~ N * e^(-n/N).
+ *
+ * The last approximation is reasonable for large N, and gives e.g. 63%
+ * occupation for n = N and 86% occupation for n = 2*N.
+ */
+#define REFCOUNT_ARRAY_INITIAL_SIZE 8
+#define REFCOUNT_ARRAY_MAX_OCCUPATION 0.86
+#define REFCOUNT_ARRAY_MAX_SIZE (1 << 20)	/* 1M entries max */
+
+static struct PrivateRefCountEntry *PrivateRefCountArray = NULL;
+static int32 PrivateRefCountArrayMask = 0;	/* (size - 1) for fast modulo */
+static int32 PrivateRefCountArrayUsed = 0;	/* entries in the array */
+static int32 PrivateRefCountArrayTolerated = 0;	/* grow when Used exceeds this */

-/* Overflow hash table for when array is full */
+/* Overflow hash table for collisions */
 static refcount_hash *PrivateRefCountHash = NULL;

-/* Count of entries that have overflowed into the hash table */
+/* Count of entries in the hash table */
 static int32 PrivateRefCountOverflowed = 0;

-/* Clock hand for selecting victim when array is full */
-static uint32 PrivateRefCountClock = 0;
-
-/* Reserved slot index, or -1 if none reserved */
-static int ReservedRefCountSlot = -1;
-
-/* Cache for last accessed entry */
-static int PrivateRefCountEntryLast = -1;
-
 /* Advisory limit on the number of pins each backend should hold */
 static uint32 MaxProportionalPins = 0;

-/* Forward declarations */
-static void ReservePrivateRefCountEntry(Buffer buffer);
-static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
-static pg_noinline PrivateRefCountEntry *GetPrivateRefCountEntrySlow(Buffer buffer);
+/* Forward declaration */
+static void GrowPrivateRefCountArray(void);

 /*
  * Initialize private refcount tracking for this backend.
@@ -126,174 +124,135 @@ InitPrivateRefCount(void)
 	 * That's very pessimistic, but outside toy-sized shared_buffers it should
 	 * allow plenty of pins. LimitAdditionalPins() and
 	 * GetAdditionalPinLimit() can be used to check the remaining balance.
- */
+ */
 	MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);

-	memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
-	memset(&PrivateRefCountArrayKeys, 0, sizeof(PrivateRefCountArrayKeys));
-	PrivateRefCountHash = refcount_create(CurrentMemoryContext, 64, NULL);
+	/* Initialize the direct-mapped array */
+	PrivateRefCountArrayMask = REFCOUNT_ARRAY_INITIAL_SIZE - 1;
+	PrivateRefCountArrayUsed = 0;
+	PrivateRefCountArrayTolerated = (int32) (REFCOUNT_ARRAY_INITIAL_SIZE * REFCOUNT_ARRAY_MAX_OCCUPATION);
+	PrivateRefCountArray = MemoryContextAllocZero(TopMemoryContext,
+												  REFCOUNT_ARRAY_INITIAL_SIZE * sizeof(PrivateRefCountEntry));
+
+	PrivateRefCountHash = refcount_create(TopMemoryContext, 64, NULL);
 }

 /*
- * Ensure that the PrivateRefCountArray has sufficient space to store one more
- * entry.
+ * Grow the private refcount array when usage exceeds the tolerated
+ * occupation. Doubles the size and rehashes all existing entries.
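+ *
+ * For example, growing from 8 to 16 slots raises the tolerated occupation
+ * from (int32) (8 * 0.86) = 6 entries to (int32) (16 * 0.86) = 13 entries.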
 */
 static void
-ReservePrivateRefCountEntry(Buffer buffer)
+GrowPrivateRefCountArray(void)
 {
-	/* Already reserved (or freed), nothing to do */
-	if (ReservedRefCountSlot != -1)
+	PrivateRefCountEntry *old_array = PrivateRefCountArray;
+	int32		old_size = PrivateRefCountArrayMask + 1;
+	int32		new_size = old_size * 2;
+	int32		new_mask = new_size - 1;
+	int32		i;
+
+	/* Don't grow beyond the maximum */
+	if (new_size > REFCOUNT_ARRAY_MAX_SIZE)
 		return;

-	/*
-	 * First search for a free entry in the array, that'll be sufficient in
-	 * the majority of cases.
-	 */
-	{
-		int			i;
-
-		for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
-		{
-			if (PrivateRefCountArrayKeys[i] == InvalidBuffer)
-			{
-				ReservedRefCountSlot = i;
-				return;
-			}
-		}
-	}
-
-	/*
-	 * No luck. All array entries are full. Move one array entry into the hash
-	 * table.
-	 */
-	{
-		int			victim_slot;
-		PrivateRefCountEntry *victim_entry;
-		PrivateRefCountEntry *hashent;
-		bool		found;
-
-		/* select victim slot */
-		victim_slot = PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES;
-		victim_entry = &PrivateRefCountArray[victim_slot];
-		ReservedRefCountSlot = victim_slot;
-
-		/* Better be used, otherwise we shouldn't get here. */
-		Assert(PrivateRefCountArrayKeys[victim_slot] != InvalidBuffer);
-		Assert(PrivateRefCountArray[victim_slot].buffer != InvalidBuffer);
-		Assert(PrivateRefCountArrayKeys[victim_slot] == PrivateRefCountArray[victim_slot].buffer);
-
-		/* enter victim array entry into hashtable */
-		hashent = refcount_insert(PrivateRefCountHash,
-								  PrivateRefCountArrayKeys[victim_slot],
-								  &found);
-		Assert(!found);
-		hashent->data = victim_entry->data;
-
-		/* clear the now free array slot */
-		PrivateRefCountArrayKeys[victim_slot] = InvalidBuffer;
-		victim_entry->buffer = InvalidBuffer;
-		victim_entry->data = 0;
-
-		PrivateRefCountOverflowed++;
-	}
-}
-
-/*
- * Create a new refcount entry for the given buffer.
- */
-static PrivateRefCountEntry *
-NewPrivateRefCountEntry(Buffer buffer)
-{
-	PrivateRefCountEntry *res;
-
-	/* only allowed to be called when a reservation has been made */
-	Assert(ReservedRefCountSlot != -1);
-
-	/* use up the reserved entry */
-	res = &PrivateRefCountArray[ReservedRefCountSlot];
-
-	/* and fill it */
-	PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
-	res->buffer = buffer;
-	res->data = 0;
-
-	/* update cache for the next lookup */
-	PrivateRefCountEntryLast = ReservedRefCountSlot;
-
-	ReservedRefCountSlot = -1;
-
-	return res;
-}
-
-/*
- * Slow-path for GetSharedBufferEntry().
- */
-static pg_noinline PrivateRefCountEntry *
-GetPrivateRefCountEntrySlow(Buffer buffer)
-{
-	PrivateRefCountEntry *res;
-	int			i;
+	/* Allocate new array and update metadata */
+	PrivateRefCountArray = MemoryContextAllocZero(TopMemoryContext,
+												  new_size * sizeof(PrivateRefCountEntry));
+	PrivateRefCountArrayMask = new_mask;
+	PrivateRefCountArrayTolerated = (int32) (new_size * REFCOUNT_ARRAY_MAX_OCCUPATION);

 	/*
-	 * First search for references in the array.
+	 * Rehash entries from the old array. When doubling, each old slot maps
+	 * to two possible new slots (slot or slot + old_size), so entries from
+	 * different old slots cannot collide. PrivateRefCountArrayUsed stays
+	 * the same.
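+	 *
+	 * For example, when growing from 8 to 16 slots, an entry for buffer 13
+	 * moves from slot 5 (13 & 7) to slot 13 (13 & 15), while an entry for
+	 * buffer 5 stays in slot 5 (5 & 15).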
	 */
-	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	for (i = 0; i < old_size; i++)
 	{
-		if (PrivateRefCountArrayKeys[i] == buffer)
+		if (old_array[i].buffer != InvalidBuffer)
 		{
-			PrivateRefCountEntryLast = i;
-			return &PrivateRefCountArray[i];
+			int			new_slot = old_array[i].buffer & new_mask;
+
+			PrivateRefCountArray[new_slot] = old_array[i];
 		}
 	}

-	/*
-	 * Only look up the buffer in the hashtable if we've previously overflowed.
-	 */
-	if (PrivateRefCountOverflowed == 0)
-		return NULL;
-
-	res = refcount_lookup(PrivateRefCountHash, buffer);
-	return res;
+	pfree(old_array);
 }

 /*
  * Return the PrivateRefCount entry for the passed buffer.
  * Returns NULL if the buffer is not currently pinned.
+ *
+ * With the direct-mapped array, lookup is O(1): check the slot at
+ * (buffer & mask), then check the hash table if needed.
  */
 static PrivateRefCountEntry *
 GetSharedBufferEntry(Buffer buffer)
 {
+	int			slot = buffer & PrivateRefCountArrayMask;
+	PrivateRefCountEntry *entry = &PrivateRefCountArray[slot];
+
 	Assert(BufferIsValid(buffer));
 	Assert(!BufferIsLocal(buffer));

-	/* Fast path: check one-entry cache */
-	if (likely(PrivateRefCountEntryLast != -1) &&
-		likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
-	{
-		return &PrivateRefCountArray[PrivateRefCountEntryLast];
-	}
+	/* Check the direct-mapped slot */
+	if (entry->buffer == buffer)
+		return entry;

-	return GetPrivateRefCountEntrySlow(buffer);
+	/* Check the hash table if there are overflowed entries */
+	if (PrivateRefCountOverflowed > 0)
+		return refcount_lookup(PrivateRefCountHash, buffer);
+
+	return NULL;
 }

 /*
  * Create a new refcount entry for a buffer that is known to not be pinned.
- * This is a fast path that skips the cache/hash lookup.
- * Returns the new entry pointer with refcount already incremented.
+ * Returns the new entry pointer with refcount set to 1.
+ *
+ * If the direct-mapped slot contains a different buffer, it is evicted
+ * to the hash table first. The array grows when its usage exceeds the
+ * tolerated occupation.
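+ *
+ * Note that the growth check runs before the slot is computed: growing
+ * changes the mask, and with it the slot a given buffer maps to.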
  */
 static PrivateRefCountEntry *
 SharedBufferCreateRef(Buffer buffer)
 {
-	PrivateRefCountEntry *ref;
+	int			slot;
+	PrivateRefCountEntry *entry;
+	bool		was_empty;

 	Assert(BufferIsValid(buffer));
 	Assert(!BufferIsLocal(buffer));

-	ReservePrivateRefCountEntry(buffer);
-	ref = NewPrivateRefCountEntry(buffer);
-	ref->data = ONE_PRIVATE_REFERENCE;
+	/* Grow when array usage exceeds the tolerated occupation */
+	if (PrivateRefCountArrayUsed > PrivateRefCountArrayTolerated)
+		GrowPrivateRefCountArray();
+
+	slot = buffer & PrivateRefCountArrayMask;
+	entry = &PrivateRefCountArray[slot];
+	was_empty = (entry->buffer == InvalidBuffer);
+
+	/* If the slot holds a different buffer, evict it to the hash table */
+	if (!was_empty)
+	{
+		PrivateRefCountEntry *hashent;
+		bool		found;
+
+		Assert(entry->buffer != buffer);
+
+		hashent = refcount_insert(PrivateRefCountHash, entry->buffer, &found);
+		Assert(!found);
+		hashent->data = entry->data;
+
+		PrivateRefCountOverflowed++;
+	}
+	else
+	{
+		/*
+		 * The slot goes from vacant to occupied. After an eviction the slot
+		 * stays occupied, so the usage count must not change in that case.
+		 */
+		PrivateRefCountArrayUsed++;
+	}
+
+	/* Use the slot for our buffer */
+	entry->buffer = buffer;
+	entry->data = ONE_PRIVATE_REFERENCE;

-	return ref;
+	return entry;
 }

 /*
@@ -327,15 +286,15 @@ SharedBufferUnref(PrivateRefCountEntry *ref)
 	Assert(SharedBufferGetLockMode(ref) == BUFFER_LOCK_UNLOCK);

 	if (ref >= &PrivateRefCountArray[0] &&
-		ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+		ref < &PrivateRefCountArray[PrivateRefCountArrayMask + 1])
 	{
+		/* Array entry - mark the slot as empty */
 		ref->buffer = InvalidBuffer;
-		PrivateRefCountArrayKeys[ref - PrivateRefCountArray] = InvalidBuffer;
-		ReservedRefCountSlot = ref - PrivateRefCountArray;
+		PrivateRefCountArrayUsed--;
 	}
 	else
 	{
-		/* could make slightly more efficient by using the pointer */
+		/* Hash table entry */
 		refcount_delete(PrivateRefCountHash, ref->buffer);
 		Assert(PrivateRefCountOverflowed > 0);
 		PrivateRefCountOverflowed--;
@@ -392,9 +351,9 @@ CheckPrivateRefCountLeaks(void)
 	char	   *s;

 	/* check the array */
-	for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+	for (i = 0; i <= PrivateRefCountArrayMask; i++)
 	{
-		if (PrivateRefCountArrayKeys[i] != InvalidBuffer)
+		if (PrivateRefCountArray[i].buffer != InvalidBuffer)
 		{
 			res = &PrivateRefCountArray[i];

@@ -475,11 +434,11 @@ PrivateRefCountEntry *
 GetNextPrivateRefCountEntry(PrivateRefCountIterator *iter)
 {
 	/* First iterate through the array */
-	while (!iter->in_hash && iter->array_index < REFCOUNT_ARRAY_ENTRIES)
+	while (!iter->in_hash && iter->array_index <= PrivateRefCountArrayMask)
 	{
 		int			idx = iter->array_index++;

-		if (PrivateRefCountArrayKeys[idx] != InvalidBuffer)
+		if (PrivateRefCountArray[idx].buffer != InvalidBuffer)
 			return &PrivateRefCountArray[idx];
 	}

@@ -547,11 +506,9 @@ FreePrivateRefCountIterator(PrivateRefCountIterator *iter)
 	uint32		estimated_pins_held;

 	/*
-	 * We get the number of "overflowed" pins for free, but don't know the
-	 * number of pins in PrivateRefCountArray. The cost of calculating that
-	 * exactly doesn't seem worth it, so just assume the max.
+	 * We track array usage, so we can compute an accurate count.
 	 */
-	estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
+	estimated_pins_held = PrivateRefCountOverflowed + PrivateRefCountArrayUsed;

 	/* Is this backend already holding more than its fair share? */
 	if (estimated_pins_held > MaxProportionalPins)
-- 
2.34.1
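
As a quick, standalone sanity check of the occupation estimate in the new
comment (not part of the patch; the slot count N = 1024, the seed, and the
PRNG are arbitrary choices), the following sketch direct-maps 2*N random
keys into an N-slot array and reports the fraction of occupied slots,
which should come out near 86%:

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	enum {N = 1024};			/* slots; a power of two, like the patch */
	int		mask = N - 1;
	char	occupied[N] = {0};
	int		used = 0;

	srand(42);
	for (int n = 0; n < 2 * N; n++)
	{
		/* a random "buffer id", direct-mapped to its slot */
		int		slot = rand() & mask;

		if (!occupied[slot])
		{
			occupied[slot] = 1;
			used++;
		}
		/*
		 * On a collision the patch evicts to the hash table; the slot
		 * itself stays occupied, so nothing changes here.
		 */
	}
	printf("occupation: %.1f%% (expected ~86%%)\n", 100.0 * used / N);
	return 0;
}

With REFCOUNT_ARRAY_MAX_OCCUPATION set at 0.86, this is roughly the point
at which the patch would double the array.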