From bbe6427b4e0e871bcb7b2cc4ce11ad8aba62799c Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 28 Jul 2025 14:01:37 +0200 Subject: [PATCH v20250807 03/11] NUMA: interleaving buffers Ensure shared buffers are allocated from all NUMA nodes, in a balanced way, instead of just using the node where Postgres initially starts, or where the kernel decides to migrate the page, etc. With pre-warming performed by a single backend, this can easily result in severely unbalanced memory distribution (with most from a single NUMA node). The kernel would eventually move some of the memory to other nodes (thanks to zone_reclaim), but that tends to take a long time. So this patch improves predictability, reduces the time needed for warmup during benchmarking, etc. It's less dependent on what the CPU scheduler does, etc. Furthermore, the buffers are mapped to NUMA nodes in a deterministic way, so this also allows further improvements like backends using buffers from the same NUMA node. The effect is similar to numactl --interleave=all but there's a number of important differences. Firstly, it's applied only to shared buffers (and also to descriptors), not to the whole shared memory segment. It's not clear we'd want to use interleaving for all parts, storing entries with different sizes and life cycles (e.g. ProcArray may need different approach). Secondly, it considers the page and block size, and makes sure to always put the whole buffer on a single NUMA node (even if it happens to use multiple memory pages), and to keep the buffer and its descriptor on the same NUMA node. The seriousness/likelihood of these issues depends on the memory page size (regular vs. huge pages). The mapping of memory to NUMA nodes happens in larger chunks. This is required to handle buffer descriptors (which are smaller than buffers), and so many more fit onto a single memory page. 
The number of buffer descriptors per memory page determines the smallest number of buffers that can be placed on a NUMA node. With 2MB huge pages this is 256MB, with 4KB pages this is 512KB. Nodes get a multiple of this, and we try to keep the nodes balanced - the last node can get less memory, though. The "buffer partitions" may not be 1:1 with NUMA nodes. There's a minimal number of partitions (default: 4) that will be created even with fewer NUMA nodes, or no NUMA at all. Each node gets the same number of partitions, to keep things simple. For example, with 2 nodes there'll be 4 partitions, with each node getting 2 of them. With 3 nodes there'll be 6 partitions (again, 2 per node). The patch introduces a simple "registry" of buffer partitions, keeping track of the first/last buffer, NUMA node, etc. This serves as a source of truth, both for this patch and for later patches building on this same buffer partition structure. With the feature disabled (GUC set to 'off'), there'll be a single partition for all the buffers (and it won't be mapped to a NUMA node). Notes: * The feature is enabled by numa_buffers_interleave GUC (default: false) * It's not clear we want to enable interleaving for all shared memory. We probably want that for shared buffers, but maybe not for ProcArray or freelists. * Similar questions are about huge pages - in general it's a good idea, but maybe it's not quite good for ProcArray. It's somewhat separate from NUMA, but not entirely because NUMA works on page granularity. PGPROC entries are ~8KB, so too large for interleaving with 4K pages, as we don't want to split the entry to multiple nodes. But could be done explicitly, by specifying which node to use for the pages. * We could partition ProcArray, with one partition per NUMA node, and then at connection time pick a node from the same node. 
The process could migrate to some other node later, especially for long-lived connections, but there's no perfect solution. Maybe we could set affinity to cores from the same node, or something like that? --- contrib/pg_buffercache/Makefile | 2 +- .../pg_buffercache--1.6--1.7.sql | 26 + contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 92 +++ src/backend/storage/buffer/buf_init.c | 626 +++++++++++++++++- src/backend/utils/init/globals.c | 3 + src/backend/utils/misc/guc_tables.c | 10 + src/include/miscadmin.h | 2 + src/include/storage/buf_internals.h | 6 + src/include/storage/bufmgr.h | 15 + src/tools/pgindent/typedefs.list | 2 + 11 files changed, 775 insertions(+), 11 deletions(-) create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index 5f748543e2e..0e618f66aec 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -9,7 +9,7 @@ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ - pg_buffercache--1.5--1.6.sql + pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache pg_buffercache_numa diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql new file mode 100644 index 00000000000..fb9003c011e --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -0,0 +1,26 @@ +/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit + +-- Register the new functions. 
+CREATE OR REPLACE FUNCTION pg_buffercache_partitions() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_partitions' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_partitions AS + SELECT P.* FROM pg_buffercache_partitions() AS P + (partition integer, -- partition index + numa_node integer, -- NUMA node of the partitioon + num_buffers integer, -- number of buffers in the partition + first_buffer integer, -- first buffer of partition + last_buffer integer); -- last buffer of partition + +-- Don't want these to be available to public. +REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor; +GRANT SELECT ON pg_buffercache_partitions TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index b030ba3a6fa..11499550945 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index ae0291e6e96..8baa7c7b543 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -27,6 +27,7 @@ #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 #define NUM_BUFFERCACHE_NUMA_ELEM 3 +#define NUM_BUFFERCACHE_PARTITIONS_ELEM 5 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -100,6 +101,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); +PG_FUNCTION_INFO_V1(pg_buffercache_partitions); /* 
Only need to touch memory once per backend process lifetime */ @@ -771,3 +773,93 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Inquire about partitioning of buffers between NUMA nodes. + */ +Datum +pg_buffercache_partitions(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_BUFFERCACHE_PARTITIONS_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. */ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "numa_node", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "num_buffers", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "first_buffer", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "last_buffer", + INT4OID, -1, 0); + + funcctx->user_fctx = BlessTupleDesc(tupledesc); + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* Set max calls and remember the user function context. 
*/ + funcctx->max_calls = BufferPartitionCount(); + } + + funcctx = SRF_PERCALL_SETUP(); + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + + int numa_node, + num_buffers, + first_buffer, + last_buffer; + + Datum values[NUM_BUFFERCACHE_PARTITIONS_ELEM]; + bool nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM]; + + BufferPartitionGet(i, &numa_node, &num_buffers, + &first_buffer, &last_buffer); + + values[0] = Int32GetDatum(i); + nulls[0] = false; + + values[1] = Int32GetDatum(numa_node); + nulls[1] = false; + + values[2] = Int32GetDatum(num_buffers); + nulls[2] = false; + + values[3] = Int32GetDatum(first_buffer); + nulls[3] = false; + + values[4] = Int32GetDatum(last_buffer); + nulls[4] = false; + + /* Build and return the tuple. */ + tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index ed1dc488a42..5b65a855b29 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -14,9 +14,17 @@ */ #include "postgres.h" +#ifdef USE_LIBNUMA +#include +#include +#endif + +#include "port/pg_numa.h" #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" +#include "storage/proc.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -24,6 +32,19 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +BufferPartitions *BufferPartitionsArray; + +static Size get_memory_page_size(void); +static void buffer_partitions_prepare(void); +static void buffer_partitions_init(void); + +/* number of NUMA nodes (as returned by numa_num_configured_nodes) */ +static int numa_nodes = -1; /* number of nodes when sizing */ +static Size numa_page_size = 0; /* page used 
to size partitions */ +static bool numa_can_partition = false; /* can map to NUMA nodes? */ +static int numa_buffers_per_node = -1; /* buffers per node */ +static int numa_partitions = 0; /* total (multiple of nodes) */ + /* * Data Structures: @@ -70,19 +91,89 @@ BufferManagerShmemInit(void) bool foundBufs, foundDescs, foundIOCV, - foundBufCkpt; + foundBufCkpt, + foundParts; + Size mem_page_size; + Size buffer_align; + + /* + * XXX A bit weird. Do we need to worry about postmaster? Could this even + * run outside postmaster? I don't think so. + * + * XXX Another issue is we may get different values than when sizing the + * the memory, because at that point we didn't know if we get huge pages, + * so we assumed we will. Shouldn't cause crashes, but we might allocate + * shared memory and then not use some of it (because of the alignment + * that we don't actually need). Not sure about better way, good for now. + */ + if (IsUnderPostmaster) + mem_page_size = pg_get_shmem_pagesize(); + else + mem_page_size = get_memory_page_size(); + + /* + * With NUMA we need to ensure the buffers are properly aligned not just + * to PG_IO_ALIGN_SIZE, but also to memory page size, because NUMA works + * on page granularity, and we don't want a buffer to get split to + * multiple nodes (when using multiple memory pages). + * + * We also don't want to interfere with other parts of shared memory, + * which could easily happen with huge pages (e.g. with data stored before + * buffers). + * + * We do this by aligning to the larger of the two values (we know both + * are power-of-two values, so the larger value is automatically a + * multiple of the lesser one). + * + * XXX Maybe there's a way to use less alignment? + * + * XXX Maybe with (mem_page_size > PG_IO_ALIGN_SIZE), we don't need to + * align to mem_page_size? Especially for very large huge pages (e.g. 1GB) + * that doesn't seem quite worth it. Maybe we should simply align to + * BLCKSZ, so that buffers don't get split? 
Still, we might interfere with + * other stuff stored in shared memory that we want to allocate on a + * particular NUMA node (e.g. ProcArray). + * + * XXX Maybe with "too large" huge pages we should just not do this, or + * maybe do this only for sufficiently large areas (e.g. shared buffers, + * but not ProcArray). + */ + buffer_align = Max(mem_page_size, PG_IO_ALIGN_SIZE); + + /* one page is a multiple of the other */ + Assert(((mem_page_size % PG_IO_ALIGN_SIZE) == 0) || + ((PG_IO_ALIGN_SIZE % mem_page_size) == 0)); + + /* allocate the partition registry first */ + BufferPartitionsArray = (BufferPartitions *) + ShmemInitStruct("Buffer Partitions", + offsetof(BufferPartitions, partitions) + + mul_size(sizeof(BufferPartition), numa_partitions), + &foundParts); - /* Align descriptors to a cacheline boundary. */ + /* + * Align descriptors to a cacheline boundary, and memory page. + * + * We want to distribute both to NUMA nodes, so that each buffer and it's + * descriptor are on the same NUMA node. So we align both the same way. + * + * XXX The memory page is always larger than cacheline, so the cacheline + * reference is a bit unnecessary. + * + * XXX In principle we only need to do this with NUMA, otherwise we could + * still align just to cacheline, as before. + */ BufferDescriptors = (BufferDescPadded *) - ShmemInitStruct("Buffer Descriptors", - NBuffers * sizeof(BufferDescPadded), - &foundDescs); + TYPEALIGN(buffer_align, + ShmemInitStruct("Buffer Descriptors", + NBuffers * sizeof(BufferDescPadded) + buffer_align, + &foundDescs)); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, + TYPEALIGN(buffer_align, ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffers * (Size) BLCKSZ + buffer_align, &foundBufs)); /* Align condition variables to cacheline boundary. 
*/ @@ -112,6 +203,12 @@ BufferManagerShmemInit(void) { int i; + /* + * Initialize the registry of buffer partitions, and also move the + * memory to different NUMA nodes (if enabled by GUC) + */ + buffer_partitions_init(); + /* * Initialize all the buffer headers. */ @@ -144,6 +241,11 @@ BufferManagerShmemInit(void) GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; } + /* + * As this point we have all the buffers in a single long freelist. With + * freelist partitioning we rebuild them in StrategyInitialize. + */ + /* Init other shared buffer-management stuff */ StrategyInitialize(!foundDescs); @@ -152,24 +254,68 @@ BufferManagerShmemInit(void) &backend_flush_after); } +/* + * Determine the size of memory page. + * + * XXX This is a bit tricky, because the result depends at which point we call + * this. Before the allocation we don't know if we succeed in allocating huge + * pages - but we have to size everything for the chance that we will. And then + * if the huge pages fail (with 'huge_pages=try'), we'll use the regular memory + * pages. But at that point we can't adjust the sizing. + * + * XXX Maybe with huge_pages=try we should do the sizing twice - first with + * huge pages, and if that fails, then without them. But not for this patch. + * Up to this point there was no such dependency on huge pages. + */ +static Size +get_memory_page_size(void) +{ + Size os_page_size; + Size huge_page_size; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + os_page_size = sysinfo.dwPageSize; +#else + os_page_size = sysconf(_SC_PAGESIZE); +#endif + + /* assume huge pages get used, unless HUGE_PAGES_OFF */ + if (huge_pages_status != HUGE_PAGES_OFF) + GetHugePageSize(&huge_page_size, NULL); + else + huge_page_size = 0; + + return Max(os_page_size, huge_page_size); +} + /* * BufferManagerShmemSize * * compute the size of shared memory for the buffer pool including * data pages, buffer descriptors, hash tables, etc. 
+ * + * XXX Called before allocation, so we don't know if huge pages get used yet. + * So we need to assume huge pages get used, and use get_memory_page_size() + * to calculate the largest possible memory page. */ Size BufferManagerShmemSize(void) { Size size = 0; + /* calculate partition info for buffers */ + buffer_partitions_prepare(); + /* size of buffer descriptors */ size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); /* to allow aligning buffer descriptors */ - size = add_size(size, PG_CACHE_LINE_SIZE); + size = add_size(size, Max(numa_page_size, PG_IO_ALIGN_SIZE)); /* size of data pages, plus alignment padding */ - size = add_size(size, PG_IO_ALIGN_SIZE); + size = add_size(size, Max(numa_page_size, PG_IO_ALIGN_SIZE)); size = add_size(size, mul_size(NBuffers, BLCKSZ)); /* size of stuff controlled by freelist.c */ @@ -184,5 +330,467 @@ BufferManagerShmemSize(void) /* size of checkpoint sort array in bufmgr.c */ size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + /* account for registry of NUMA partitions */ + size = add_size(size, MAXALIGN(offsetof(BufferPartitions, partitions) + + mul_size(sizeof(BufferPartition), numa_partitions))); + return size; } + +/* + * Calculate the NUMA node for a given buffer. + */ +int +BufferGetNode(Buffer buffer) +{ + /* not NUMA interleaving */ + if (numa_buffers_per_node == -1) + return 0; + + return (buffer / numa_buffers_per_node); +} + +/* + * pg_numa_interleave_memory + * move memory to different NUMA nodes in larger chunks + * + * startptr - start of the region (should be aligned to page size) + * endptr - end of the region (doesn't need to be aligned) + * mem_page_size - size of the memory page size + * chunk_size - size of the chunk to move to a single node (should be multiple + * of page size + * num_nodes - number of nodes to allocate memory to + * + * XXX Maybe this should use numa_tonode_memory and numa_police_memory instead? 
+ * That might be more efficient than numa_move_pages, as it works on larger + * chunks of memory, not individual system pages, I think. + * + * XXX The "interleave" name is not quite accurate, I guess. + */ +static void +pg_numa_move_to_node(char *startptr, char *endptr, int node) +{ + Size mem_page_size; + Size sz; + + /* + * Get the "actual" memory page size, not the one we used for sizing. We + * might have used huge page for sizing, but only get regular pages when + * allocating, so we must use the smaller pages here. + * + * XXX A bit weird. Do we need to worry about postmaster? Could this even + * run outside postmaster? I don't think so. + */ + if (IsUnderPostmaster) + mem_page_size = pg_get_shmem_pagesize(); + else + mem_page_size = get_memory_page_size(); + + Assert((int64) startptr % mem_page_size == 0); + + sz = (endptr - startptr); + numa_tonode_memory(startptr, sz, node); +} + + +#define MIN_BUFFER_PARTITIONS 4 + +/* + * buffer_partitions_prepare + * Calculate parameters for partitioning buffers. + * + * We want to split the shared buffers into multiple partitions, of roughly + * the same size. This is meant to serve multiple purposes. We want to map + * the partitions to different NUMA nodes, to balance memory usage, and + * allow partitioning some data structures built on top of buffers, to give + * preference to local access (buffers on the same NUMA node). This applies + * mostly to freelists and clocksweep. + * + * We may want to use partitioning even on non-NUMA systems, or when running + * on a single NUMA node. Partitioning the freelist/clocksweep is beneficial + * even without the NUMA effects. + * + * So we try to always build at least 4 partitions (MIN_BUFFER_PARTITIONS) + * in total, or at least one partition per NUMA node. We always create the + * same number of partitions per NUMA node. 
+ * + * Some examples: + * + * - non-NUMA system (or 1 NUMA node): 4 partitions for the single node + * + * - 2 NUMA nodes: 4 partitions, 2 for each node + * + * - 3 NUMA nodes: 6 partitions, 2 for each node + * + * - 4+ NUMA nodes: one partition per node + * + * NUMA works on the memory-page granularity, which determines the smallest + * amount of memory we can allocate to single node. This is determined by + * how many BufferDescriptors fit onto a single memory page, so this depends + * on huge page support. With 2MB huge pages (typical on x86 Linux), this is + * 32768 buffers (256MB). With regular 4kB pages, it's 64 buffers (512KB). + * + * Note: This is determined before the allocation, i.e. we don't know if the + * allocation got to use huge pages. So unless huge_pages=off we assume we're + * using huge pages. + * + * This minimal size requirement only matters for the per-node amount of + * memory, not for the individual partitions. The partitions for the same + * node are a contiguous chunk of memory, which can be split arbitrarily, + * it's independent of the NUMA granularity. + * + * XXX This patch only implements placing the buffers onto different NUMA + * nodes. The freelist/clocksweep partitioning is implemented in separate + * patches later in the patch series. Those patches however use the same + * buffer partition registry, to align the partitions. + * + * + * XXX This needs to consider the minimum chunk size, i.e. we can't split + * buffers beyond some point, at some point it gets we run into the size of + * buffer descriptors. Not sure if we should give preference to one of these + * (probably at least print a warning). + * + * XXX We want to do this even with numa_buffers_interleave=false, so that the + * other patches can do their partitioning. But in that case we don't need to + * enforce the min chunk size (probably)? + * + * XXX We need to only call this once, when sizing the memory. 
But at that + * point we don't know if we get to use huge pages or not (unless when huge + * pages are disabled). We'll proceed as if the huge pages were used, and we + * may have to use larger partitions. Maybe there's some sort of fallback, + * but for now we simply disable the NUMA partitioning - it simply means the + * shared buffers are too small. + * + * XXX We don't need to make each partition a multiple of min_partition_size. + * That's something we need to do for a node (because NUMA works at granularity + * of pages), but partitions for a single node can split that arbitrarily. + * Although keeping the sizes power-of-two would allow calculating everything + * as shift/mask, without expensive division/modulo operations. + */ +static void +buffer_partitions_prepare(void) +{ + /* + * Minimum number of buffers we can allocate to a NUMA node (determined by + * how many BufferDescriptors fit onto a memory page). + */ + int min_node_buffers; + + /* + * Maximum number of nodes we can split shared buffers to, assuming each + * node gets the smallest allocatable chunk (the last node can get a + * smaller amount of memory, not the full chunk). + */ + int max_nodes; + + /* + * How many partitions to create per node. Could be more than 1 for small + * number of nodes (of non-NUMA systems). + */ + int num_partitions_per_node; + + /* bail out if already initialized (calculate only once) */ + if (numa_nodes != -1) + return; + + /* XXX only gives us the number, the nodes may not be 0, 1, 2, ... */ + numa_nodes = numa_num_configured_nodes(); + + /* XXX can this happen? */ + if (numa_nodes < 1) + numa_nodes = 1; + + elog(WARNING, "IsUnderPostmaster %d", IsUnderPostmaster); + + /* + * XXX A bit weird. Do we need to worry about postmaster? Could this even + * run outside postmaster? I don't think so. 
+ * + * XXX Another issue is we may get different values than when sizing the + * the memory, because at that point we didn't know if we get huge pages, + * so we assumed we will. Shouldn't cause crashes, but we might allocate + * shared memory and then not use some of it (because of the alignment + * that we don't actually need). Not sure about better way, good for now. + */ + if (IsUnderPostmaster) + numa_page_size = pg_get_shmem_pagesize(); + else + numa_page_size = get_memory_page_size(); + + /* make sure the chunks will align nicely */ + Assert(BLCKSZ % sizeof(BufferDescPadded) == 0); + Assert(numa_page_size % sizeof(BufferDescPadded) == 0); + Assert(((BLCKSZ % numa_page_size) == 0) || ((numa_page_size % BLCKSZ) == 0)); + + /* + * The minimum number of buffers we can allocate from a single node, using + * the memory page size (determined by buffer descriptors). NUMA allocates + * memory in pages, and we need to do that for both buffers and + * descriptors at the same time. + * + * In practice the BLCKSZ doesn't really matter, because it's much larger + * than BufferDescPadded, so the result is determined buffer descriptors. + */ + min_node_buffers = (numa_page_size / sizeof(BufferDescPadded)); + + /* + * Maximum number of nodes (each getting min_node_buffers) we can handle + * given the current shared buffers size. The last node is allowed to be + * smaller (half of the other nodes). + */ + max_nodes = (NBuffers + (min_node_buffers / 2)) / min_node_buffers; + + /* + * Can we actually do NUMA partitioning with these settings? If we can't + * handle the current number of nodes, then no. + * + * XXX This shouldn't be a big issue in practice. NUMA systems typically + * run with large shared buffers, which also makes the imbalance issues + * fairly significant (it's quick to rebalance 128MB, much slower to do + * that for 256GB). 
+ */ + numa_can_partition = true; /* assume we can allocate to nodes */ + if (numa_nodes > max_nodes) + { + elog(WARNING, "shared buffers too small for %d nodes (max nodes %d)", + numa_nodes, max_nodes); + numa_can_partition = false; + } + + /* + * We know we can partition to the desired number of nodes, now it's time + * to figure out how many partitions we need per node. We simply add + * partitions per node until we reach MIN_BUFFER_PARTITIONS. + * + * XXX Maybe we should make sure to keep the actual partition size a power + * of 2, to make the calculations simpler (shift instead of mod). + */ + num_partitions_per_node = 1; + + while (numa_nodes * num_partitions_per_node < MIN_BUFFER_PARTITIONS) + num_partitions_per_node++; + + /* now we know the total number of partitions */ + numa_partitions = (numa_nodes * num_partitions_per_node); + + /* + * Finally, calculate how many buffers we'll assign to a single NUMA node. + * If we have only a single node, or can't map to that many nodes, just + * take a "fair share" of buffers. + * + * XXX In both cases the last node can get fewer buffers. 
+ */ + if (!numa_can_partition) + { + numa_buffers_per_node = (NBuffers + (numa_nodes - 1)) / numa_nodes; + } + else + { + numa_buffers_per_node = min_node_buffers; + while (numa_buffers_per_node * numa_nodes < NBuffers) + numa_buffers_per_node += min_node_buffers; + + /* the last node should get at least some buffers */ + Assert(NBuffers - (numa_nodes - 1) * numa_buffers_per_node > 0); + } + + elog(LOG, "NUMA: buffers %d partitions %d num_nodes %d per_node %d buffers_per_node %d (min %d)", + NBuffers, numa_partitions, numa_nodes, num_partitions_per_node, + numa_buffers_per_node, min_node_buffers); +} + +static void +AssertCheckBufferPartitions(void) +{ +#ifdef USE_ASSERT_CHECKING + int num_buffers = 0; + + for (int i = 0; i < numa_partitions; i++) + { + BufferPartition *part = &BufferPartitionsArray->partitions[i]; + + /* + * We can get a single-buffer partition, if the sizing forces the last + * partition to be just one buffer. But it's unlikely (and + * undesirable). + */ + Assert(part->first_buffer <= part->last_buffer); + Assert((part->last_buffer - part->first_buffer + 1) == part->num_buffers); + + num_buffers += part->num_buffers; + + /* + * The first partition needs to start on buffer 0. Later partitions + * need to be contiguous, without skipping any buffers. 
+ */ + if (i == 0) + { + Assert(part->first_buffer == 0); + } + else + { + BufferPartition *prev = &BufferPartitionsArray->partitions[i - 1]; + + Assert((part->first_buffer - 1) == prev->last_buffer); + } + + /* the last partition needs to end on buffer (NBuffers - 1) */ + if (i == (numa_partitions - 1)) + { + Assert(part->last_buffer == (NBuffers - 1)); + } + } + + Assert(num_buffers == NBuffers); +#endif +} + +static void +buffer_partitions_init(void) +{ + int remaining_buffers = NBuffers; + int buffer = 0; + int parts_per_node = (numa_partitions / numa_nodes); + char *buffers_ptr, + *descriptors_ptr; + + BufferPartitionsArray->npartitions = numa_partitions; + + for (int n = 0; n < numa_nodes; n++) + { + /* buffers this node should get (last node can get fewer) */ + int node_buffers = Min(remaining_buffers, numa_buffers_per_node); + + /* split node buffers netween partitions (last one can get fewer) */ + int part_buffers = (node_buffers + (parts_per_node - 1)) / parts_per_node; + + remaining_buffers -= node_buffers; + + Assert((node_buffers > 0) && (node_buffers <= NBuffers)); + Assert((n >= 0) && (n < numa_nodes)); + + for (int p = 0; p < parts_per_node; p++) + { + int idx = (n * parts_per_node) + p; + BufferPartition *part = &BufferPartitionsArray->partitions[idx]; + int num_buffers = Min(node_buffers, part_buffers); + + Assert((idx >= 0) && (idx < numa_partitions)); + Assert((buffer >= 0) && (buffer < NBuffers)); + Assert((num_buffers > 0) && (num_buffers <= part_buffers)); + + /* XXX we should get the actual node ID from the mask */ + part->numa_node = n; + + part->num_buffers = num_buffers; + part->first_buffer = buffer; + part->last_buffer = buffer + (num_buffers - 1); + + elog(LOG, "NUMA: buffer %d node %d partition %d buffers %d first %d last %d", idx, n, p, num_buffers, buffer, buffer + (num_buffers - 1)); + + buffer += num_buffers; + node_buffers -= part_buffers; + } + } + + AssertCheckBufferPartitions(); + + /* + * With buffers interleaving disabled 
(or can't partition, because of + * shared buffers being too small), we're done. + */ + if (!numa_buffers_interleave || !numa_can_partition) + return; + + /* + * Assign chunks of buffers and buffer descriptors to the available NUMA + * nodes. We can't use the regular interleaving, because with regular + * memory pages (smaller than BLCKSZ) we'd split all buffers to multiple + * NUMA nodes. And we don't want that. + * + * But even with huge pages it seems like a good idea to not have mapping + * for each page. + * + * So we always assign a larger contiguous chunk of buffers to the same + * NUMA node, as calculated by choose_chunk_buffers(). We try to keep the + * chunks large enough to work both for buffers and buffer descriptors, + * but not too large. See the comments at choose_chunk_buffers() for + * details. + * + * Thanks to the earlier alignment (to memory page etc.), we know the + * buffers won't get split, etc. + * + * This also makes it easier / straightforward to calculate which NUMA + * node a buffer belongs to (it's a matter of divide + mod). See + * BufferGetNode(). + * + * We need to account for partitions being of different length, when the + * NBuffers is not nicely divisible. To do that we keep track of the start + * of the next partition. 
+ */ + buffers_ptr = BufferBlocks; + descriptors_ptr = (char *) BufferDescriptors; + + for (int i = 0; i < numa_partitions; i++) + { + BufferPartition *part = &BufferPartitionsArray->partitions[i]; + char *startptr, + *endptr; + + /* first map buffers */ + startptr = buffers_ptr; + endptr = startptr + ((Size) part->num_buffers * BLCKSZ); + buffers_ptr = endptr; /* start of the next partition */ + + elog(LOG, "NUMA: buffer_partitions_init: %d => %d buffers %d start %p end %p (size %ld)", + i, part->numa_node, part->num_buffers, startptr, endptr, (endptr - startptr)); + + pg_numa_move_to_node(startptr, endptr, part->numa_node); + + /* now do the same for buffer descriptors */ + startptr = descriptors_ptr; + endptr = startptr + ((Size) part->num_buffers * sizeof(BufferDescPadded)); + descriptors_ptr = endptr; + + elog(LOG, "NUMA: buffer_partitions_init: %d => %d descriptors %d start %p end %p (size %ld)", + i, part->numa_node, part->num_buffers, startptr, endptr, (endptr - startptr)); + + pg_numa_move_to_node(startptr, endptr, part->numa_node); + } + + /* we should have consumed the arrays exactly */ + Assert(buffers_ptr == BufferBlocks + (Size) NBuffers * BLCKSZ); + Assert(descriptors_ptr == (char *) BufferDescriptors + (Size) NBuffers * sizeof(BufferDescPadded)); +} + +int +BufferPartitionCount(void) +{ + return BufferPartitionsArray->npartitions; +} + +void +BufferPartitionGet(int idx, int *node, int *num_buffers, + int *first_buffer, int *last_buffer) +{ + if ((idx >= 0) && (idx < BufferPartitionsArray->npartitions)) + { + BufferPartition *part = &BufferPartitionsArray->partitions[idx]; + + *node = part->numa_node; + *num_buffers = part->num_buffers; + *first_buffer = part->first_buffer; + *last_buffer = part->last_buffer; + + return; + } + + elog(ERROR, "invalid partition index"); +} + +void +BufferPartitionParams(int *num_partitions, int *num_nodes) +{ + *num_partitions = numa_partitions; + *num_nodes = numa_nodes; +} diff --git 
a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index d31cb45a058..876cb64cf66 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -145,6 +145,9 @@ int max_worker_processes = 8; int max_parallel_workers = 8; int MaxBackends = 0; +/* NUMA stuff */ +bool numa_buffers_interleave = false; + /* GUC parameters for vacuum */ int VacuumBufferUsageLimit = 2048; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index d14b1678e7f..9570087aa60 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2116,6 +2116,16 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"numa_buffers_interleave", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Enables NUMA interleaving of shared buffers."), + gettext_noop("When enabled, the buffers in shared memory are interleaved to all NUMA nodes."), + }, + &numa_buffers_interleave, + false, + NULL, NULL, NULL + }, + { {"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."), diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1bef98471c3..014a6079af2 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -178,6 +178,8 @@ extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; extern PGDLLIMPORT int max_parallel_workers; +extern PGDLLIMPORT bool numa_buffers_interleave; + extern PGDLLIMPORT int commit_timestamp_buffers; extern PGDLLIMPORT int multixact_member_buffers; extern PGDLLIMPORT int multixact_offset_buffers; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 52a71b138f7..9dfbecb9fe4 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -323,6 +323,7 @@ typedef struct WritebackContext /* in buf_init.c */ extern PGDLLIMPORT 
BufferDescPadded *BufferDescriptors; +extern PGDLLIMPORT BufferPartitions *BufferPartitionsArray; extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; extern PGDLLIMPORT WritebackContext BackendWritebackContext; @@ -491,4 +492,9 @@ extern void DropRelationLocalBuffers(RelFileLocator rlocator, extern void DropRelationAllLocalBuffers(RelFileLocator rlocator); extern void AtEOXact_LocalBuffers(bool isCommit); +extern int BufferPartitionCount(void); +extern void BufferPartitionGet(int idx, int *node, int *num_buffers, + int *first_buffer, int *last_buffer); +extern void BufferPartitionParams(int *num_partitions, int *num_nodes); + #endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 41fdc1e7693..deaf4f19fa4 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -143,6 +143,20 @@ struct ReadBuffersOperation typedef struct ReadBuffersOperation ReadBuffersOperation; +typedef struct BufferPartition +{ + int numa_node; + int num_buffers; + int first_buffer; + int last_buffer; +} BufferPartition; + +typedef struct BufferPartitions +{ + int npartitions; + BufferPartition partitions[FLEXIBLE_ARRAY_MEMBER]; +} BufferPartitions; + /* forward declared, to avoid having to expose buf_internals.h here */ struct WritebackContext; @@ -319,6 +333,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, /* in buf_init.c */ extern void BufferManagerShmemInit(void); extern Size BufferManagerShmemSize(void); +extern int BufferGetNode(Buffer buffer); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index e6f2e93b2d6..03ca3b7c8bc 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -346,6 +346,8 @@ BufferDescPadded BufferHeapTupleTableSlot BufferLookupEnt BufferManagerRelation +BufferPartition +BufferPartitions BufferStrategyControl BufferTag BufferUsage -- 2.50.1