From abe7cf9f1314bdaae2361ad237ef859e00b69c07 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Wed, 17 Sep 2025 23:04:29 +0200
Subject: [PATCH v20251126 1/9] Infrastructure for partitioning shared buffers

The patch introduces a simple "registry" of buffer partitions, keeping track of the first/last buffer, etc. This serves as a source of truth for later patches (e.g. to partition clock-sweep).

The registry is a small BufferPartitions array in shared memory, with partitions sized to be a fair share of shared buffers. Later patches may improve this to consider NUMA, and similar details.

With the feature disabled (GUC set to empty list), there'll be a single partition for all the buffers (and it won't be mapped to a NUMA node).

Notes:

* Maybe the number of partitions should be configurable? Right now it's hard-coded as 4, but testing shows that increasing it (e.g. to 16) can be beneficial.

* This partitioning is independent of the partitions defined in lwlock.h, which defines 128 partitions to reduce lock contention on the buffer mapping hash table. The number of partitions introduced by this patch is expected to be much lower (a dozen or so).
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.8'" to load this file. \quit

-- Register the new function.
--
-- Returns one row per shared-buffer partition.  Plain CREATE FUNCTION is used
-- (not CREATE OR REPLACE) so that the upgrade fails, rather than silently
-- taking over an object of the same name that already exists.
CREATE FUNCTION pg_buffercache_partitions()
RETURNS SETOF RECORD
AS 'MODULE_PATHNAME', 'pg_buffercache_partitions'
LANGUAGE C PARALLEL SAFE;

-- Create a view for convenient access.  The column list must match the
-- four int4 columns produced by the C function.
CREATE VIEW pg_buffercache_partitions AS
	SELECT P.* FROM pg_buffercache_partitions() AS P
	(partition integer,			-- partition index
	 num_buffers integer,		-- number of buffers in the partition
	 first_buffer integer,		-- first buffer of partition (inclusive)
	 last_buffer integer);		-- last buffer of partition (inclusive)

-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor; +GRANT SELECT ON pg_buffercache_partitions TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 11499550945..d2fa8ba53ba 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.7' +default_version = '1.8' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index ae1712fc93c..8c89855192f 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -27,6 +27,7 @@ #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 #define NUM_BUFFERCACHE_OS_PAGES_ELEM 3 +#define NUM_BUFFERCACHE_PARTITIONS_ELEM 4 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -101,6 +102,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); +PG_FUNCTION_INFO_V1(pg_buffercache_partitions); /* Only need to touch memory once per backend process lifetime */ @@ -826,3 +828,87 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Inquire about partitioning of buffers between NUMA nodes. 
+ */ +Datum +pg_buffercache_partitions(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_BUFFERCACHE_PARTITIONS_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. */ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "num_buffers", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "first_buffer", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "last_buffer", + INT4OID, -1, 0); + + funcctx->user_fctx = BlessTupleDesc(tupledesc); + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* Set max calls and remember the user function context. 
*/ + funcctx->max_calls = BufferPartitionCount(); + } + + funcctx = SRF_PERCALL_SETUP(); + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + + int num_buffers, + first_buffer, + last_buffer; + + Datum values[NUM_BUFFERCACHE_PARTITIONS_ELEM]; + bool nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM]; + + BufferPartitionGet(i, &num_buffers, + &first_buffer, &last_buffer); + + values[0] = Int32GetDatum(i); + nulls[0] = false; + + values[1] = Int32GetDatum(num_buffers); + nulls[1] = false; + + values[2] = Int32GetDatum(first_buffer); + nulls[2] = false; + + values[3] = Int32GetDatum(last_buffer); + nulls[3] = false; + + /* Build and return the tuple. */ + tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6fd3a6bbac5..528a368a8b7 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -17,6 +17,11 @@ #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" +#include "storage/proc.h" +#include "utils/guc.h" +#include "utils/guc_hooks.h" +#include "utils/varlena.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -24,6 +29,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* * + * number of buffer partitions */ +#define NUM_CLOCK_SWEEP_PARTITIONS 4 + +/* Array of structs with information about buffer ranges */ +BufferPartitions *BufferPartitionsArray = NULL; + +static void buffer_partitions_init(void); /* * Data Structures: @@ -70,7 +83,15 @@ BufferManagerShmemInit(void) bool foundBufs, foundDescs, foundIOCV, - foundBufCkpt; + foundBufCkpt, + foundParts; + + /* allocate the partition registry first */ + BufferPartitionsArray = 
(BufferPartitions *) + ShmemInitStruct("Buffer Partitions", + offsetof(BufferPartitions, partitions) + + mul_size(sizeof(BufferPartition), NUM_CLOCK_SWEEP_PARTITIONS), + &foundParts); /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) @@ -112,6 +133,9 @@ BufferManagerShmemInit(void) { int i; + /* Initialize buffer partitions (calculate buffer ranges). */ + buffer_partitions_init(); + /* * Initialize all the buffer headers. */ @@ -175,5 +199,123 @@ BufferManagerShmemSize(void) /* size of checkpoint sort array in bufmgr.c */ size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + /* account for registry of NUMA partitions */ + size = add_size(size, MAXALIGN(offsetof(BufferPartitions, partitions) + + mul_size(sizeof(BufferPartition), NUM_CLOCK_SWEEP_PARTITIONS))); + return size; } + +/* + * Sanity checks of buffers partitions - there must be no gaps, it must cover + * the whole range of buffers, etc. + */ +static void +AssertCheckBufferPartitions(void) +{ +#ifdef USE_ASSERT_CHECKING + int num_buffers = 0; + + Assert(BufferPartitionsArray->npartitions > 0); + + for (int i = 0; i < BufferPartitionsArray->npartitions; i++) + { + BufferPartition *part = &BufferPartitionsArray->partitions[i]; + + /* + * We can get a single-buffer partition, if the sizing forces the last + * partition to be just one buffer. But it's unlikely (and + * undesirable). + */ + Assert(part->first_buffer <= part->last_buffer); + Assert((part->last_buffer - part->first_buffer + 1) == part->num_buffers); + + num_buffers += part->num_buffers; + + /* + * The first partition needs to start on buffer 0. Later partitions + * need to be contiguous, without skipping any buffers. 
+ */ + if (i == 0) + { + Assert(part->first_buffer == 0); + } + else + { + BufferPartition *prev = &BufferPartitionsArray->partitions[i - 1]; + + Assert((part->first_buffer - 1) == prev->last_buffer); + } + + /* the last partition needs to end on buffer (NBuffers - 1) */ + if (i == (BufferPartitionsArray->npartitions - 1)) + { + Assert(part->last_buffer == (NBuffers - 1)); + } + } + + Assert(num_buffers == NBuffers); +#endif +} + +/* + * buffer_partitions_init + * Initialize array of buffer partitions. + */ +static void +buffer_partitions_init(void) +{ + int remaining_buffers = NBuffers; + int buffer = 0; + + /* number of buffers per partition (make sure to not overflow) */ + int part_buffers + = ((int64) NBuffers + (NUM_CLOCK_SWEEP_PARTITIONS - 1)) / NUM_CLOCK_SWEEP_PARTITIONS; + + BufferPartitionsArray->npartitions = NUM_CLOCK_SWEEP_PARTITIONS; + + for (int n = 0; n < BufferPartitionsArray->npartitions; n++) + { + BufferPartition *part = &BufferPartitionsArray->partitions[n]; + + /* buffers this partition should get (last partition can get fewer) */ + int num_buffers = Min(remaining_buffers, part_buffers); + + remaining_buffers -= num_buffers; + + Assert((num_buffers > 0) && (num_buffers <= part_buffers)); + Assert((buffer >= 0) && (buffer < NBuffers)); + + part->num_buffers = num_buffers; + part->first_buffer = buffer; + part->last_buffer = buffer + (num_buffers - 1); + + buffer += num_buffers; + } + + AssertCheckBufferPartitions(); +} + +int +BufferPartitionCount(void) +{ + return BufferPartitionsArray->npartitions; +} + +void +BufferPartitionGet(int idx, int *num_buffers, + int *first_buffer, int *last_buffer) +{ + if ((idx >= 0) && (idx < BufferPartitionsArray->npartitions)) + { + BufferPartition *part = &BufferPartitionsArray->partitions[idx]; + + *num_buffers = part->num_buffers; + *first_buffer = part->first_buffer; + *last_buffer = part->last_buffer; + + return; + } + + elog(ERROR, "invalid partition index"); +} diff --git 
/*
 * Information about one partition of shared buffers.
 *
 * A partition is a contiguous range of buffers, identified by 0-based buffer
 * indexes (the first partition starts at 0, the last one ends at
 * NBuffers - 1).
 *
 * first/last buffer - the values are inclusive, so
 * num_buffers == last_buffer - first_buffer + 1.
 */
typedef struct BufferPartition
{
	int			num_buffers;	/* number of buffers */
	int			first_buffer;	/* first buffer of partition */
	int			last_buffer;	/* last buffer of partition */
} BufferPartition;

/*
 * An array of information about all partitions.
 *
 * Variable-length struct: "partitions" holds npartitions entries, so the
 * allocation size is offsetof(BufferPartitions, partitions) plus
 * npartitions * sizeof(BufferPartition).
 */
typedef struct BufferPartitions
{
	int			npartitions;	/* number of partitions */
	BufferPartition partitions[FLEXIBLE_ARRAY_MEMBER];
} BufferPartitions;
a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -347,6 +347,8 @@ BufferDescPadded BufferHeapTupleTableSlot BufferLookupEnt BufferManagerRelation +BufferPartition +BufferPartitions BufferStrategyControl BufferTag BufferUsage -- 2.51.1