From a31e641c96beccea652f4e93ecee22398ff80b15 Mon Sep 17 00:00:00 2001
From: Tomas Vondra
Date: Tue, 11 Nov 2025 12:05:35 +0100
Subject: [PATCH v20251121 6/7] NUMA: shared buffers partitioning

Ensure shared buffers are allocated from all NUMA nodes, in a balanced way, instead of just using the node where Postgres initially starts, or where the kernel decides to migrate the page, etc. With pre-warming performed by a single backend, this can easily result in a severely unbalanced memory distribution (with most of it from a single NUMA node). The kernel would eventually move some of the memory to other nodes (thanks to zone_reclaim), but that tends to take a long time. So this patch improves predictability, reduces the time needed for warmup during benchmarking, and is less dependent on what the CPU scheduler does.

Furthermore, the buffers are mapped to NUMA nodes in a deterministic way, so this also allows further improvements like backends using buffers from the same NUMA node.

The effect is similar to numactl --interleave=all, but there are a number of important differences. Firstly, it's applied only to shared buffers (and also to descriptors), not to the whole shared memory segment. It's not clear we'd want to use interleaving for all parts, storing entries with different sizes and life cycles (e.g. ProcArray may need a different approach). Secondly, it considers the page and block size, and makes sure to always put the whole buffer on a single NUMA node (even if it happens to use multiple memory pages), and to keep the buffer and its descriptor on the same NUMA node. The seriousness/likelihood of these issues depends on the memory page size (regular vs. huge pages).

The mapping of memory to NUMA nodes happens in larger chunks. This is required to handle buffer descriptors, which are smaller than buffers, so many more of them fit onto a single memory page. The number of buffer descriptors per memory page determines the smallest number of buffers that can be placed on a NUMA node. With 2MB huge pages this is 256MB, with 4KB pages this is 512KB. Nodes get a multiple of this, and we try to keep the nodes balanced - the last node can get less memory, though.

The "buffer partitions" may not be 1:1 with NUMA nodes. There's a minimum number of partitions (default: 4) that will be created even with fewer NUMA nodes, or no NUMA at all. Each node gets the same number of partitions, to keep things simple. For example, with 2 nodes there'll be 4 partitions, with each node getting 2 of them. With 3 nodes there'll be 6 partitions (again, 2 per node).

Notes:

* The feature is enabled by the debug_numa = buffers GUC (default: empty), which works similarly to debug_io_direct.

* This patch partitions just shared buffers, not the whole shared memory. A later patch will do that for PGPROC, but it's tricky and requires a different approach because of huge pages.
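To make the chunk arithmetic above concrete, here is a small sketch (illustration only, not part of the patch), assuming the typical x86 Linux values mentioned above (2MB huge pages, 64-byte padded descriptors, 8kB blocks); the variable names are illustrative:

    /* smallest per-node chunk: how many descriptors fit onto one huge page */
    int  min_node_buffers = (2 * 1024 * 1024) / 64;      /* = 32768 buffers */
    long chunk_bytes = (long) min_node_buffers * 8192;   /* = 256MB of data */

    /* deterministic mapping from a buffer to its NUMA node */
    int  node = buffer_id / numa_buffers_per_node;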
--- .../pg_buffercache--1.6--1.7.sql | 1 + contrib/pg_buffercache/pg_buffercache_pages.c | 52 +- src/backend/port/sysv_shmem.c | 32 + src/backend/storage/buffer/buf_init.c | 569 +++++++++++++++++- src/backend/storage/buffer/freelist.c | 88 ++- src/backend/utils/misc/guc_parameters.dat | 10 + src/backend/utils/misc/guc_tables.c | 1 + src/include/port/pg_numa.h | 6 + src/include/storage/buf_internals.h | 14 +- src/include/storage/bufmgr.h | 4 + src/include/utils/guc_hooks.h | 3 + src/port/pg_numa.c | 64 ++ 12 files changed, 772 insertions(+), 72 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql index 2c4d560514d..dc2ce019283 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -13,6 +13,7 @@ LANGUAGE C PARALLEL SAFE; CREATE VIEW pg_buffercache_partitions AS SELECT P.* FROM pg_buffercache_partitions() AS P (partition integer, -- partition index + numa_node integer, -- NUMA node of the partition num_buffers integer, -- number of buffers in the partition first_buffer integer, -- first buffer of partition last_buffer integer, -- last buffer of partition diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index aa8ea08e1bb..179a38fd6ed 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -29,7 +29,7 @@ #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 #define NUM_BUFFERCACHE_NUMA_ELEM 3 -#define NUM_BUFFERCACHE_PARTITIONS_ELEM 11 +#define NUM_BUFFERCACHE_PARTITIONS_ELEM 12 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -813,25 +813,27 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition", INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 2, "num_buffers", + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "numa_node", INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 3, "first_buffer", + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "num_buffers", INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 4, "last_buffer", + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "first_buffer", INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 5, "num_passes", + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "last_buffer", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "num_passes", INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 6, "next_buffer", + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "next_buffer", INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 7, "total_allocs", + TupleDescInitEntry(tupledesc, (AttrNumber) 8, "total_allocs", INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 8, "num_allocs", + TupleDescInitEntry(tupledesc, (AttrNumber) 9, "num_allocs", INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 9, "total_req_allocs", + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "total_req_allocs", INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 10, "num_req_allocs", + TupleDescInitEntry(tupledesc, (AttrNumber) 11, "num_req_allocs", INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 11, "weigths", + TupleDescInitEntry(tupledesc, (AttrNumber) 12, "weigths", typentry->typarray, -1, 0); funcctx->user_fctx = BlessTupleDesc(tupledesc); @@ -849,7 +851,8 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) {
uint32 i = funcctx->call_cntr; - int num_buffers, + int numa_node, + num_buffers, first_buffer, last_buffer; @@ -868,7 +871,7 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) Datum values[NUM_BUFFERCACHE_PARTITIONS_ELEM]; bool nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM]; - BufferPartitionGet(i, &num_buffers, + BufferPartitionGet(i, &numa_node, &num_buffers, &first_buffer, &last_buffer); ClockSweepPartitionGetInfo(i, @@ -886,36 +889,39 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) values[0] = Int32GetDatum(i); nulls[0] = false; - values[1] = Int32GetDatum(num_buffers); + values[1] = Int32GetDatum(numa_node); nulls[1] = false; - values[2] = Int32GetDatum(first_buffer); + values[2] = Int32GetDatum(num_buffers); nulls[2] = false; - values[3] = Int32GetDatum(last_buffer); + values[3] = Int32GetDatum(first_buffer); nulls[3] = false; - values[4] = Int64GetDatum(complete_passes); + values[4] = Int32GetDatum(last_buffer); nulls[4] = false; - values[5] = Int32GetDatum(next_victim_buffer); + values[5] = Int64GetDatum(complete_passes); nulls[5] = false; - values[6] = Int64GetDatum(buffer_total_allocs); + values[6] = Int32GetDatum(next_victim_buffer); nulls[6] = false; - values[7] = Int64GetDatum(buffer_allocs); + values[7] = Int64GetDatum(buffer_total_allocs); nulls[7] = false; - values[8] = Int64GetDatum(buffer_total_req_allocs); + values[8] = Int64GetDatum(buffer_allocs); nulls[8] = false; - values[9] = Int64GetDatum(buffer_req_allocs); + values[9] = Int64GetDatum(buffer_total_req_allocs); nulls[9] = false; - values[10] = PointerGetDatum(array); + values[10] = Int64GetDatum(buffer_req_allocs); nulls[10] = false; + values[11] = PointerGetDatum(array); + nulls[11] = false; + /* Build and return the tuple. */ tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); result = HeapTupleGetDatum(tuple); diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6..6019bee334d 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -19,6 +19,7 @@ */ #include "postgres.h" +#include <numa.h> #include #include #include @@ -602,6 +603,14 @@ CreateAnonymousSegment(Size *size) void *ptr = MAP_FAILED; int mmap_errno = 0; + /* + * Set the memory policy to interleave across all NUMA nodes before calling + * mmap, in case we use MAP_POPULATE to prefault all the pages. + * + * XXX Probably not needed without that, but also costs nothing. + */ + numa_set_membind(numa_all_nodes_ptr); + #ifndef MAP_HUGETLB /* PGSharedMemoryCreate should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON); @@ -616,6 +625,9 @@ CreateAnonymousSegment(Size *size) GetHugePageSize(&hugepagesize, &mmap_flags); + /* prefault the memory at start? */ + /* mmap_flags |= MAP_POPULATE; */ + if (allocsize % hugepagesize != 0) allocsize += hugepagesize - (allocsize % hugepagesize); @@ -638,6 +650,11 @@ CreateAnonymousSegment(Size *size) if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) { + /* int mmap_flags = 0; */ + + /* prefault the memory at start? */ + /* mmap_flags |= MAP_POPULATE; */ + /* * Use the original size, not the rounded-up value, when falling back * to non-huge pages. @@ -663,6 +680,21 @@ allocsize) : 0)); } + /* undo the earlier numa_set_membind() call. */ + numa_set_localalloc(); + + /* + * Before touching the memory, set the allocation policy, so that + * it gets interleaved by default. We have to do this to distribute + * the memory that's not located explicitly.
We need this especially + * with huge pages, where we could run out of huge pages on some + * nodes and crash otherwise. + * + * XXX Probably not needed with MAP_POPULATE, in which case the policy + * was already set by numa_set_membind() earlier. But doesn't hurt. + */ + numa_interleave_memory(ptr, allocsize, numa_all_nodes_ptr); + *size = allocsize; return ptr; } diff --git a/src/backend/storage/buffer/buf_init.c index 0362fda24aa..587859a5754 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -14,6 +14,12 @@ */ #include "postgres.h" +#ifdef USE_LIBNUMA +#include <numa.h> +#include <numaif.h> +#endif + +#include "port/pg_numa.h" #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" @@ -29,15 +35,24 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; -/* * - * number of buffer partitions */ -#define NUM_CLOCK_SWEEP_PARTITIONS 4 +/* + * Minimum number of buffer partitions, no matter the number of NUMA nodes. + */ +#define MIN_BUFFER_PARTITIONS 4 /* Array of structs with information about buffer ranges */ BufferPartitions *BufferPartitionsArray = NULL; +static void buffer_partitions_prepare(void); static void buffer_partitions_init(void); +/* number of NUMA nodes (as returned by numa_num_configured_nodes) */ +static int numa_nodes = -1; /* number of nodes when sizing */ +static Size numa_page_size = 0; /* page used to size partitions */ +static bool numa_can_partition = false; /* can map to NUMA nodes? */ +static int numa_buffers_per_node = -1; /* buffers per node */ +static int numa_partitions = 0; /* total (multiple of nodes) */ + /* * Data Structures: * buffers live in a freelist and a lookup data structure. @@ -85,25 +100,85 @@ BufferManagerShmemInit(void) foundIOCV, foundBufCkpt, foundParts; + Size buffer_align; + + /* + * Determine the memory page size used to partition shared buffers over + * the available NUMA nodes. + * + * XXX We have to call prepare again, because with EXEC_BACKEND we may not + * see the values already calculated in BufferManagerShmemSize(). + * + * XXX We need to be careful to get the same value when calculating the size + * and then later when initializing the structs after allocation, or to not + * depend on that value too much. Before the allocation we don't know if we + * get huge pages, so we just have to assume we do. + */ + buffer_partitions_prepare(); + + /* + * With NUMA we need to ensure the buffers are properly aligned not just + * to PG_IO_ALIGN_SIZE, but also to memory page size. NUMA works on page + * granularity, and we don't want a buffer to get split to multiple nodes + * (when spanning multiple memory pages). + * + * We also don't want to interfere with other parts of shared memory, + * which could easily happen with huge pages (e.g. with data stored before + * buffers). + * + * We do this by aligning to the larger of the two values (we know both + * are power-of-two values, so the larger value is automatically a + * multiple of the lesser one). + * + * XXX Maybe there's a way to use less alignment? + * + * XXX Maybe with (numa_page_size > PG_IO_ALIGN_SIZE), we don't need to + * align to numa_page_size? Especially for very large huge pages (e.g. 1GB) + * that doesn't seem quite worth it. Maybe we should simply align to + * BLCKSZ, so that buffers don't get split?
Still, we might interfere with + * other stuff stored in shared memory that we want to allocate on a + * particular NUMA node (e.g. ProcArray). + * + * XXX Maybe with "too large" huge pages we should just not do this, or + * maybe do this only for sufficiently large areas (e.g. shared buffers, + * but not ProcArray). + */ + buffer_align = Max(numa_page_size, PG_IO_ALIGN_SIZE); + + /* one page is a multiple of the other */ + Assert(((numa_page_size % PG_IO_ALIGN_SIZE) == 0) || + ((PG_IO_ALIGN_SIZE % numa_page_size) == 0)); /* allocate the partition registry first */ BufferPartitionsArray = (BufferPartitions *) ShmemInitStruct("Buffer Partitions", offsetof(BufferPartitions, partitions) + - mul_size(sizeof(BufferPartition), NUM_CLOCK_SWEEP_PARTITIONS), + mul_size(sizeof(BufferPartition), numa_partitions), &foundParts); - /* Align descriptors to a cacheline boundary. */ + /* + * Align descriptors to a cacheline boundary, and memory page. + * + * We want to distribute both to NUMA nodes, so that each buffer and its + * descriptor are on the same NUMA node. So we align both the same way. + * + * XXX The memory page is always larger than a cacheline, so the cacheline + * reference is a bit unnecessary. + * + * XXX In principle we only need to do this with NUMA, otherwise we could + * still align just to cacheline, as before. + */ BufferDescriptors = (BufferDescPadded *) - ShmemInitStruct("Buffer Descriptors", - NBuffers * sizeof(BufferDescPadded), - &foundDescs); + TYPEALIGN(buffer_align, + ShmemInitStruct("Buffer Descriptors", + NBuffers * sizeof(BufferDescPadded) + buffer_align, + &foundDescs)); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - TYPEALIGN(PG_IO_ALIGN_SIZE, + TYPEALIGN(buffer_align, ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffers * (Size) BLCKSZ + buffer_align, &foundBufs)); /* Align condition variables to cacheline boundary. */ @@ -133,7 +208,10 @@ { int i; - /* Initialize buffer partitions (calculate buffer ranges). */ + /* + * Initialize buffer partitions, including moving memory to different + * NUMA nodes (if enabled by GUC). + */ buffer_partitions_init(); /* @@ -172,19 +250,26 @@ * * compute the size of shared memory for the buffer pool including * data pages, buffer descriptors, hash tables, etc. + * + * XXX Called before allocation, so we don't know if huge pages get used yet. + * So we need to assume huge pages get used, and use pg_numa_page_size() + * to calculate the largest possible memory page.
*/ Size BufferManagerShmemSize(void) { Size size = 0; + /* calculate partition info for buffers */ + buffer_partitions_prepare(); + /* size of buffer descriptors */ size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); /* to allow aligning buffer descriptors */ - size = add_size(size, PG_CACHE_LINE_SIZE); + size = add_size(size, Max(numa_page_size, PG_IO_ALIGN_SIZE)); /* size of data pages, plus alignment padding */ - size = add_size(size, PG_IO_ALIGN_SIZE); + size = add_size(size, Max(numa_page_size, PG_IO_ALIGN_SIZE)); size = add_size(size, mul_size(NBuffers, BLCKSZ)); /* size of stuff controlled by freelist.c */ @@ -201,11 +286,244 @@ BufferManagerShmemSize(void) /* account for registry of NUMA partitions */ size = add_size(size, MAXALIGN(offsetof(BufferPartitions, partitions) + - mul_size(sizeof(BufferPartition), NUM_CLOCK_SWEEP_PARTITIONS))); + mul_size(sizeof(BufferPartition), numa_partitions))); return size; } +/* + * Calculate the NUMA node for a given buffer. + */ +int +BufferGetNode(Buffer buffer) +{ + /* NUMA partitioning not initialized */ + if (numa_buffers_per_node == -1) + return 0; + + /* no NUMA-aware partitioning */ + if ((numa_flags & NUMA_BUFFERS) == 0) + return 0; + + return (buffer / numa_buffers_per_node); +} + +/* + * buffer_partitions_prepare + * Calculate parameters for partitioning buffers. + * + * We want to split the shared buffers into multiple partitions, of roughly + * the same size. This is meant to serve multiple purposes. We want to map + * the partitions to different NUMA nodes, to balance memory usage, and + * allow partitioning some data structures built on top of buffers, to give + * preference to local access (buffers on the same NUMA node). This applies + * mostly to freelists and clocksweep. + * + * We may want to use partitioning even on non-NUMA systems, or when running + * on a single NUMA node. Partitioning the freelist/clocksweep is beneficial + * even without the NUMA effects. + * + * So we try to always build at least 4 partitions (MIN_BUFFER_PARTITIONS) + * in total, or at least one partition per NUMA node. We always create the + * same number of partitions per NUMA node. + * + * Some examples: + * + * - non-NUMA system (or 1 NUMA node): 4 partitions for the single node + * + * - 2 NUMA nodes: 4 partitions, 2 for each node + * + * - 3 NUMA nodes: 6 partitions, 2 for each node + * + * - 4+ NUMA nodes: one partition per node + * + * NUMA works on the memory-page granularity, which determines the smallest + * amount of memory we can allocate to a single node. This is determined by + * how many BufferDescriptors fit onto a single memory page, so this depends + * on huge page support. With 2MB huge pages (typical on x86 Linux), this is + * 32768 buffers (256MB). With regular 4kB pages, it's 64 buffers (512KB). + * + * Note: This is determined before the allocation, i.e. we don't know if the + * allocation got to use huge pages. So unless huge_pages=off we assume we're + * using huge pages. + * + * This minimal size requirement only matters for the per-node amount of + * memory, not for the individual partitions. The partitions for the same + * node are a contiguous chunk of memory, which can be split arbitrarily; + * it's independent of the NUMA granularity. + * + * XXX This patch only implements placing the buffers onto different NUMA + * nodes. The freelist/clocksweep partitioning is implemented in separate + * patches earlier in the patch series. Those patches, however, use the same + * buffer partition registry, to align the partitions.
+ * + * XXX This needs to consider the minimum chunk size, i.e. we can't split + * buffers beyond some point; eventually we run into the size of + * buffer descriptors. Not sure if we should give preference to one of these + * (probably at least print a warning). + * + * XXX We want to do this even with buffer interleaving disabled, so that the + * other patches can do their partitioning. But in that case we don't need to + * enforce the min chunk size (probably)? + * + * XXX We need to only call this once, when sizing the memory. But at that + * point we don't know if we get to use huge pages or not (unless huge + * pages are disabled). We'll proceed as if the huge pages were used, and we + * may have to use larger partitions. Maybe there's some sort of fallback, + * but for now we simply disable the NUMA partitioning - it simply means the + * shared buffers are too small. + * + * XXX We don't need to make each partition a multiple of min_partition_size. + * That's something we need to do for a node (because NUMA works at granularity + * of pages), but partitions for a single node can split that arbitrarily. + * Although keeping the sizes power-of-two would allow calculating everything + * as shift/mask, without expensive division/modulo operations. + */ +static void +buffer_partitions_prepare(void) +{ + /* + * Minimum number of buffers we can allocate to a NUMA node (determined by + * how many BufferDescriptors fit onto a memory page). + */ + int min_node_buffers; + + /* + * Maximum number of nodes we can split shared buffers to, assuming each + * node gets the smallest allocatable chunk (the last node can get a + * smaller amount of memory, not the full chunk). + */ + int max_nodes; + + /* + * How many partitions to create per node. Could be more than 1 for a small + * number of nodes (or non-NUMA systems). + */ + int num_partitions_per_node; + + /* bail out if already initialized (calculate only once) */ + if (numa_nodes != -1) + return; + + /* XXX only gives us the number, the nodes may not be 0, 1, 2, ... */ +#ifdef USE_LIBNUMA + numa_nodes = numa_num_configured_nodes(); +#else + /* without NUMA, assume there's just one node */ + numa_nodes = 1; +#endif + + /* we should never get here without at least one NUMA node */ + Assert(numa_nodes > 0); + + /* + * XXX A bit weird. Do we need to worry about postmaster? Could this even + * run outside postmaster? I don't think so. + * + * XXX Another issue is we may get different values than when sizing + * the memory, because at that point we didn't know if we get huge pages, + * so we assumed we will. Shouldn't cause crashes, but we might allocate + * shared memory and then not use some of it (because of the alignment + * that we don't actually need). Not sure about a better way; good for now. + */ + numa_page_size = pg_numa_page_size(); + + /* make sure the chunks will align nicely */ + Assert(BLCKSZ % sizeof(BufferDescPadded) == 0); + Assert(numa_page_size % sizeof(BufferDescPadded) == 0); + Assert(((BLCKSZ % numa_page_size) == 0) || ((numa_page_size % BLCKSZ) == 0)); + + /* + * The minimum number of buffers we can allocate from a single node, using + * the memory page size (determined by buffer descriptors). NUMA allocates + * memory in pages, and we need to do that for both buffers and + * descriptors at the same time. + * + * In practice the BLCKSZ doesn't really matter, because it's much larger + * than BufferDescPadded, so the result is determined by buffer descriptors.
+ */ + min_node_buffers = (numa_page_size / sizeof(BufferDescPadded)); + + /* + * Maximum number of nodes (each getting min_node_buffers) we can handle + * given the current shared buffers size. The last node is allowed to be + * smaller (at least half of min_node_buffers). + */ + max_nodes = (NBuffers + (min_node_buffers / 2)) / min_node_buffers; + + /* + * Can we actually do NUMA partitioning with these settings? If we can't + * handle the current number of nodes, then no. + * + * XXX This shouldn't be a big issue in practice. NUMA systems typically + * run with large shared buffers, which also makes the imbalance issues + * fairly significant (it's quick to rebalance 128MB, much slower to do + * that for 256GB). + */ + numa_can_partition = true; /* assume we can allocate to nodes */ + if (numa_nodes > max_nodes) + { + elog(NOTICE, "shared buffers too small for %d nodes (max nodes %d)", + numa_nodes, max_nodes); + numa_can_partition = false; + } + else if ((numa_flags & NUMA_BUFFERS) == 0) + { + elog(NOTICE, "NUMA-partitioning of buffers disabled"); + numa_can_partition = false; + } + + /* + * We know we can partition to the desired number of nodes; now it's time + * to figure out how many partitions we need per node. We simply add + * partitions per node until we reach MIN_BUFFER_PARTITIONS. + * + * XXX Maybe we should make sure to keep the actual partition size a power + * of 2, to make the calculations simpler (shift instead of mod). + */ + num_partitions_per_node = 1; + + while (numa_nodes * num_partitions_per_node < MIN_BUFFER_PARTITIONS) + num_partitions_per_node++; + + /* now we know the total number of partitions */ + numa_partitions = (numa_nodes * num_partitions_per_node); + + /* + * Finally, calculate how many buffers we'll assign to a single NUMA node. + * If we have only a single node, or when we can't partition for some + * reason, just take a "fair share" of buffers. This can happen for a + * number of reasons - missing NUMA support, partitioning of buffers not + * enabled, or not enough buffers for this many nodes. + * + * We still build partitions, because we want to allow partitioning of + * the clock-sweep later. + * + * The number of buffers for each partition is calculated later, once we + * have allocated the shared memory (because that's where we store it). + * + * XXX In both cases the last node can get fewer buffers. + */ + if (!numa_can_partition) + { + numa_buffers_per_node = (NBuffers + (numa_nodes - 1)) / numa_nodes; + } + else + { + numa_buffers_per_node = min_node_buffers; + while ((int64) numa_buffers_per_node * numa_nodes < NBuffers) + numa_buffers_per_node += min_node_buffers; + + /* the last node should get at least some buffers */ + Assert(NBuffers - (numa_nodes - 1) * numa_buffers_per_node > 0); + } + + elog(DEBUG1, "NUMA: buffers %d partitions %d num_nodes %d per_node %d buffers_per_node %d (min %d)", + NBuffers, numa_partitions, numa_nodes, num_partitions_per_node, + numa_buffers_per_node, min_node_buffers); +} + /* * Sanity checks of buffer partitions - there must be no gaps, it must cover * the whole range of buffers, etc.
@@ -267,33 +585,137 @@ { int remaining_buffers = NBuffers; int buffer = 0; + int parts_per_node = (numa_partitions / numa_nodes); + char *buffers_ptr, + *descriptors_ptr; - /* number of buffers per partition (make sure to not overflow) */ - int part_buffers - = ((int64) NBuffers + (NUM_CLOCK_SWEEP_PARTITIONS - 1)) / NUM_CLOCK_SWEEP_PARTITIONS; - - BufferPartitionsArray->npartitions = NUM_CLOCK_SWEEP_PARTITIONS; + BufferPartitionsArray->npartitions = numa_partitions; + BufferPartitionsArray->nnodes = numa_nodes; - for (int n = 0; n < BufferPartitionsArray->npartitions; n++) + for (int n = 0; n < numa_nodes; n++) { - BufferPartition *part = &BufferPartitionsArray->partitions[n]; + /* buffers this node should get (last node can get fewer) */ + int node_buffers = Min(remaining_buffers, numa_buffers_per_node); - /* buffers this partition should get (last partition can get fewer) */ - int num_buffers = Min(remaining_buffers, part_buffers); + /* split node buffers between partitions (last one can get fewer) */ + int part_buffers = (node_buffers + (parts_per_node - 1)) / parts_per_node; - remaining_buffers -= num_buffers; + remaining_buffers -= node_buffers; - Assert((num_buffers > 0) && (num_buffers <= part_buffers)); - Assert((buffer >= 0) && (buffer < NBuffers)); + Assert((node_buffers > 0) && (node_buffers <= NBuffers)); + Assert((n >= 0) && (n < numa_nodes)); + + for (int p = 0; p < parts_per_node; p++) + { + int idx = (n * parts_per_node) + p; + BufferPartition *part = &BufferPartitionsArray->partitions[idx]; + int num_buffers = Min(node_buffers, part_buffers); - part->num_buffers = num_buffers; - part->first_buffer = buffer; - part->last_buffer = buffer + (num_buffers - 1); + Assert((idx >= 0) && (idx < numa_partitions)); + Assert((buffer >= 0) && (buffer < NBuffers)); + Assert((num_buffers > 0) && (num_buffers <= part_buffers)); - buffer += num_buffers; + /* XXX we should get the actual node ID from the mask */ + if (numa_can_partition) + part->numa_node = n; + else + part->numa_node = -1; + + part->num_buffers = num_buffers; + part->first_buffer = buffer; + part->last_buffer = buffer + (num_buffers - 1); + + elog(DEBUG1, "NUMA: partition %d node %d node-partition %d buffers %d first %d last %d", idx, n, p, num_buffers, buffer, buffer + (num_buffers - 1)); + + buffer += num_buffers; + node_buffers -= part_buffers; + } } AssertCheckBufferPartitions(); + + /* + * With buffer interleaving disabled (or can't partition, because of + * shared buffers being too small), we're done. + */ + if (((numa_flags & NUMA_BUFFERS) == 0) || !numa_can_partition) + return; + + /* + * Assign chunks of buffers and buffer descriptors to the available NUMA + * nodes. We can't use the regular interleaving, because with regular + * memory pages (smaller than BLCKSZ) we'd split all buffers to multiple + * NUMA nodes. And we don't want that. + * + * But even with huge pages it seems like a good idea to not map pages + * one by one. + * + * So we always assign a larger contiguous chunk of buffers to the same + * NUMA node, as calculated by buffer_partitions_prepare(). We try to keep + * the chunks large enough to work both for buffers and buffer descriptors, + * but not too large. See the comments at buffer_partitions_prepare() for + * details. + * + * Thanks to the earlier alignment (to memory page etc.), we know the + * buffers won't get split, etc. + * + * This also makes it straightforward to calculate which NUMA + * node a buffer belongs to (it's a matter of divide + mod).
See + * BufferGetNode(). + * + * We need to account for partitions being of different length, when + * NBuffers is not nicely divisible. To do that we keep track of the start + * of the next partition. + * + * We always map all partitions for the same node at once, so that we + * don't need to worry about alignment of memory pages that get split + * between partitions (we only worry about min_node_buffers for whole + * NUMA nodes, not for individual partitions). + */ + buffers_ptr = BufferBlocks; + descriptors_ptr = (char *) BufferDescriptors; + + for (int n = 0; n < numa_nodes; n++) + { + char *startptr, + *endptr; + int num_buffers = 0; + + /* sum buffers in all partitions for this node */ + for (int p = 0; p < parts_per_node; p++) + { + int pidx = (n * parts_per_node + p); + BufferPartition *part = &BufferPartitionsArray->partitions[pidx]; + + Assert(part->numa_node == n); + + num_buffers += part->num_buffers; + } + + /* first map buffers */ + startptr = buffers_ptr; + endptr = startptr + ((Size) num_buffers * BLCKSZ); + buffers_ptr = endptr; /* start of the next node's chunk */ + + elog(DEBUG1, "NUMA: buffer_partitions_init: %d => buffers %d start %p end %p (size %zd)", + n, num_buffers, startptr, endptr, (endptr - startptr)); + + pg_numa_move_to_node(startptr, endptr, n); + + /* now do the same for buffer descriptors */ + startptr = descriptors_ptr; + endptr = startptr + ((Size) num_buffers * sizeof(BufferDescPadded)); + descriptors_ptr = endptr; + + elog(DEBUG1, "NUMA: buffer_partitions_init: %d => descriptors %d start %p end %p (size %zd)", + n, num_buffers, startptr, endptr, (endptr - startptr)); + + pg_numa_move_to_node(startptr, endptr, n); + } + + /* we should have consumed the arrays exactly */ + Assert(buffers_ptr == BufferBlocks + (Size) NBuffers * BLCKSZ); + Assert(descriptors_ptr == (char *) BufferDescriptors + (Size) NBuffers * sizeof(BufferDescPadded)); } int @@ -302,14 +724,21 @@ BufferPartitionCount(void) { return BufferPartitionsArray->npartitions; } +int +BufferPartitionNodes(void) +{ + return BufferPartitionsArray->nnodes; +} + void -BufferPartitionGet(int idx, int *num_buffers, +BufferPartitionGet(int idx, int *node, int *num_buffers, int *first_buffer, int *last_buffer) { if ((idx >= 0) && (idx < BufferPartitionsArray->npartitions)) { BufferPartition *part = &BufferPartitionsArray->partitions[idx]; + *node = part->numa_node; *num_buffers = part->num_buffers; *first_buffer = part->first_buffer; *last_buffer = part->last_buffer; @@ -322,8 +751,82 @@ BufferPartitionGet(int idx, int *num_buffers, /* return parameters before the partitions are initialized (during sizing) */ void -BufferPartitionParams(int *num_partitions) +BufferPartitionParams(int *num_partitions, int *num_nodes) { if (num_partitions) - *num_partitions = NUM_CLOCK_SWEEP_PARTITIONS; + *num_partitions = numa_partitions; + + if (num_nodes) + *num_nodes = numa_nodes; +} + +/* XXX the GUC hooks should probably be somewhere else?
*/ +bool +check_debug_numa(char **newval, void **extra, GucSource source) +{ + bool result = true; + int flags; + +#if USE_LIBNUMA == 0 + if (strcmp(*newval, "") != 0) + { + GUC_check_errdetail("\"%s\" is not supported on this platform.", + "debug_numa"); + result = false; + } + flags = 0; +#else + List *elemlist; + ListCell *l; + char *rawstring; + + /* Need a modifiable copy of string */ + rawstring = pstrdup(*newval); + + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + GUC_check_errdetail("Invalid list syntax in parameter \"%s\".", + "debug_numa"); + pfree(rawstring); + list_free(elemlist); + return false; + } + + flags = 0; + foreach(l, elemlist) + { + char *item = (char *) lfirst(l); + + if (pg_strcasecmp(item, "buffers") == 0) + flags |= NUMA_BUFFERS; + else + { + GUC_check_errdetail("Invalid option \"%s\".", item); + result = false; + break; + } + } + + pfree(rawstring); + list_free(elemlist); +#endif + + if (!result) + return result; + + /* Save the flags in *extra, for use by assign_debug_io_direct */ + *extra = guc_malloc(LOG, sizeof(int)); + if (!*extra) + return false; + *((int *) *extra) = flags; + + return result; +} + +void +assign_debug_numa(const char *newval, void *extra) +{ + int *flags = (int *) extra; + + numa_flags = *flags; } diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 8be77a9c8b1..810a549efce 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -124,7 +124,9 @@ typedef struct //int __attribute__((aligned(64))) bgwprocno; /* info about freelist partitioning */ + int num_nodes; /* effectively number of NUMA nodes */ int num_partitions; + int num_partitions_per_node; /* clocksweep partitions */ ClockSweep sweeps[FLEXIBLE_ARRAY_MEMBER]; @@ -270,16 +272,72 @@ ClockSweepTick(ClockSweep *sweep) * calculate_partition_index * calculate the buffer / clock-sweep partition to use * - * use PID to determine the buffer partition - * - * XXX We could use NUMA node / core ID to pick partition, but we'd need - * to handle cases with fewer nodes/cores than partitions somehow. Although, - * maybe the balancing would handle that too. + * With libnuma, use the NUMA node and CPU to pick the partition. Otherwise + * use just PID instead of CPU (we assume everything is a single NUMA node). */ static int calculate_partition_index(void) { - return (MyProcPid % StrategyControl->num_partitions); + int cpu, + node, + index; + + /* + * The buffers are partitioned, so determine the CPU/NUMA node, and pick a + * partition based on that. + * + * Without NUMA assume everything is a single NUMA node, and we pick the + * partition based on PID (we may not have sched_getcpu). + */ +#ifdef USE_LIBNUMA + cpu = sched_getcpu(); + + if (cpu < 0) + elog(ERROR, "sched_getcpu failed: %m"); + + node = numa_node_of_cpu(cpu); +#else + cpu = MyProcPid; + node = 0; +#endif + + Assert(StrategyControl->num_partitions == + (StrategyControl->num_nodes * StrategyControl->num_partitions_per_node)); + + /* + * XXX We should't get nodes that we haven't considered while building the + * partitions. Maybe if we allow this (e.g. due to support adjusting the + * NUMA stuff at runtime), we should just do our best to minimize the + * conflicts somehow. But it'll make the mapping harder, so for now we + * ignore it. + */ + if (node > StrategyControl->num_nodes) + elog(ERROR, "node out of range: %d > %u", cpu, StrategyControl->num_nodes); + + /* + * Find the partition. 
If we have a single partition per node, we can + * calculate the index directly from node. Otherwise we need to do two + * steps, using node and then cpu. + */ + if (StrategyControl->num_partitions_per_node == 1) + { + /* fast-path */ + index = (node % StrategyControl->num_partitions); + } + else + { + int index_group, + index_part; + + /* two steps - calculate group from node, partition from cpu */ + index_group = (node % StrategyControl->num_nodes); + index_part = (cpu % StrategyControl->num_partitions_per_node); + + index = (index_group * StrategyControl->num_partitions_per_node) + + index_part; + } + + return index; } /* @@ -947,7 +1005,7 @@ StrategyShmemSize(void) Size size = 0; int num_partitions; - BufferPartitionParams(&num_partitions); + BufferPartitionParams(&num_partitions, NULL); /* size of lookup hash table ... see comment in StrategyInitialize */ size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); @@ -974,9 +1032,17 @@ StrategyInitialize(bool init) { bool found; + int num_nodes; int num_partitions; + int num_partitions_per_node; num_partitions = BufferPartitionCount(); + num_nodes = BufferPartitionNodes(); + + /* always a multiple of NUMA nodes */ + Assert(num_partitions % num_nodes == 0); + + num_partitions_per_node = (num_partitions / num_nodes); /* * Initialize the shared buffer lookup hashtable. @@ -1011,7 +1077,8 @@ StrategyInitialize(bool init) /* Initialize the clock sweep pointers (for all partitions) */ for (int i = 0; i < num_partitions; i++) { - int num_buffers, + int node, + num_buffers, first_buffer, last_buffer; @@ -1020,7 +1087,8 @@ StrategyInitialize(bool init) pg_atomic_init_u32(&StrategyControl->sweeps[i].nextVictimBuffer, 0); /* get info about the buffer partition */ - BufferPartitionGet(i, &num_buffers, &first_buffer, &last_buffer); + BufferPartitionGet(i, &node, &num_buffers, + &first_buffer, &last_buffer); /* * FIXME This may not be quite right, because if NBuffers is not a @@ -1056,6 +1124,8 @@ /* initialize the partitioned clocksweep */ StrategyControl->num_partitions = num_partitions; + StrategyControl->num_nodes = num_nodes; + StrategyControl->num_partitions_per_node = num_partitions_per_node; } else Assert(!init); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 1128167c025..8192c27066b 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -636,6 +636,16 @@ options => 'debug_logical_replication_streaming_options', }, +{ name => 'debug_numa', type => 'string', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', + short_desc => 'NUMA-aware partitioning of shared memory.', + long_desc => 'An empty string disables NUMA-aware partitioning.', + flags => 'GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE', + variable => 'debug_numa_string', + boot_val => '', + check_hook => 'check_debug_numa', + assign_hook => 'assign_debug_numa', +}, + { name => 'debug_parallel_query', type => 'enum', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', short_desc => 'Forces the planner\'s use parallel query nodes.', long_desc => 'This can be useful for testing the parallel query infrastructure by forcing the planner to generate plans that contain nodes that perform tuple communication between workers and the main process.', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 0209b2067a2..404eb3432f9 100644 --- a/src/backend/utils/misc/guc_tables.c +++
b/src/backend/utils/misc/guc_tables.c @@ -595,6 +595,7 @@ static char *server_version_string; static int server_version_num; static char *debug_io_direct_string; static char *restrict_nonsystem_relation_kind_string; +static char *debug_numa_string; #ifdef HAVE_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0 diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 9d1ea6d0db8..9734aa315ff 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -17,6 +17,12 @@ extern PGDLLIMPORT int pg_numa_init(void); extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); extern PGDLLIMPORT int pg_numa_get_max_node(void); +extern PGDLLIMPORT Size pg_numa_page_size(void); +extern PGDLLIMPORT void pg_numa_move_to_node(char *startptr, char *endptr, int node); + +extern PGDLLIMPORT int numa_flags; + +#define NUMA_BUFFERS 0x01 #ifdef USE_LIBNUMA diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 1118b386228..33377841c57 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -299,10 +299,10 @@ typedef struct BufferDesc * line sized. * * XXX: As this is primarily matters in highly concurrent workloads which - * probably all are 64bit these days, and the space wastage would be a bit - * more noticeable on 32bit systems, we don't force the stride to be cache - * line sized on those. If somebody does actual performance testing, we can - * reevaluate. + * probably all are 64bit these days. We force the stride to be cache line + * sized even on 32bit systems, where the space wastage is a bit more + * noticeable, to allow partitioning of shared buffers (which requires the + * memory page size to be a multiple of the buffer descriptor size). * * Note that local buffer descriptors aren't forced to be aligned - as there's * no concurrent access to those it's unlikely to be beneficial. @@ -312,7 +312,7 @@ typedef struct BufferDesc * platform with either 32 or 128 byte line sizes, it's good to align to * boundaries and avoid false sharing. */ -#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ?
64 : 1) +#define BUFFERDESC_PAD_TO_SIZE 64 typedef union BufferDescPadded { @@ -555,8 +555,8 @@ extern void AtEOXact_LocalBuffers(bool isCommit); extern int BufferPartitionCount(void); extern int BufferPartitionNodes(void); -extern void BufferPartitionGet(int idx, int *num_buffers, +extern void BufferPartitionGet(int idx, int *node, int *num_buffers, int *first_buffer, int *last_buffer); -extern void BufferPartitionParams(int *num_partitions); +extern void BufferPartitionParams(int *num_partitions, int *num_nodes); #endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 4e7b1fcd4ab..510018db115 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -156,10 +156,12 @@ typedef struct ReadBuffersOperation ReadBuffersOperation; /* * information about one partition of shared buffers * + * numa_node specifies the node for this partition (-1 means allocated on any node) * first/last buffer - the values are inclusive */ typedef struct BufferPartition { + int numa_node; /* NUMA node (-1 no node) */ int num_buffers; /* number of buffers */ int first_buffer; /* first buffer of partition */ int last_buffer; /* last buffer of partition */ @@ -169,6 +171,7 @@ typedef struct BufferPartition typedef struct BufferPartitions { int npartitions; /* number of partitions */ + int nnodes; /* number of NUMA nodes */ BufferPartition partitions[FLEXIBLE_ARRAY_MEMBER]; } BufferPartitions; @@ -346,6 +349,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, /* in buf_init.c */ extern void BufferManagerShmemInit(void); extern Size BufferManagerShmemSize(void); +extern int BufferGetNode(Buffer buffer); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 82ac8646a8d..15304df0de5 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -175,4 +175,7 @@ extern bool check_synchronized_standby_slots(char **newval, void **extra, GucSource source); extern void assign_synchronized_standby_slots(const char *newval, void *extra); +extern bool check_debug_numa(char **newval, void **extra, GucSource source); +extern void assign_debug_numa(const char *newval, void *extra); + #endif /* GUC_HOOKS_H */ diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index 540ada3f8ef..d9c3841e078 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -18,6 +18,9 @@ #include "miscadmin.h" #include "port/pg_numa.h" +#include "storage/pg_shmem.h" + +int numa_flags; /* * At this point we provide support only for Linux thanks to libnuma, but in @@ -116,6 +119,36 @@ pg_numa_get_max_node(void) return numa_max_node(); } +/* + * pg_numa_move_to_node + * move memory to different NUMA nodes in larger chunks + * + * startptr - start of the region (should be aligned to page size) + * endptr - end of the region (doesn't need to be aligned) + * node - node to move the memory to + * + * The "startptr" is expected to be a multiple of system memory page size, as + * determined by pg_numa_page_size. + * + * XXX We only expect to do this during startup, when the shared memory is + * still being set up. + */ +void +pg_numa_move_to_node(char *startptr, char *endptr, int node) +{ + Size sz = (endptr - startptr); + + Assert((int64) startptr % pg_numa_page_size() == 0); + + /* + * numa_tonode_memory does not actually cause a page fault, and thus does + * not locate the memory on the node.
So it's fast, at least compared to + * pg_numa_query_pages, and does not make startup longer. But it also + * means the expensive part happens later, on the first access. + */ + numa_tonode_memory(startptr, sz, node); +} + #else /* Empty wrappers */ @@ -138,4 +171,35 @@ pg_numa_get_max_node(void) return 0; } +void +pg_numa_move_to_node(char *startptr, char *endptr, int node) +{ + /* we don't expect to ever get here in builds without libnuma */ + Assert(false); +} + #endif + +Size +pg_numa_page_size(void) +{ + Size os_page_size; + Size huge_page_size; + +#ifdef WIN32 + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + os_page_size = sysinfo.dwPageSize; +#else + os_page_size = sysconf(_SC_PAGESIZE); +#endif + + /* assume huge pages get used, unless HUGE_PAGES_OFF */ + if (huge_pages_status != HUGE_PAGES_OFF) + GetHugePageSize(&huge_page_size, NULL); + else + huge_page_size = 0; + + return Max(os_page_size, huge_page_size); +} -- 2.51.1
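For reference, a minimal standalone sketch (not part of the patch) of the chunked placement technique used above: instead of per-page interleaving, whole page-aligned chunks of a region are assigned to NUMA nodes with numa_tonode_memory(), which sets the policy without faulting the pages in. The region size and all names are illustrative; build with gcc -lnuma on Linux.

    #include <numa.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main(void)
    {
        size_t  page = (size_t) sysconf(_SC_PAGESIZE);
        size_t  size = 64 * 1024 * 1024;    /* 64MB demo region */
        size_t  chunk;
        int     nodes;
        char   *base;

        if (numa_available() < 0)
        {
            fprintf(stderr, "no NUMA support\n");
            return 1;
        }
        nodes = numa_num_configured_nodes();

        base = mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
        {
            perror("mmap");
            return 1;
        }

        /* per-node chunk, rounded up to a whole memory page */
        chunk = (size / nodes + page - 1) & ~(page - 1);

        for (int n = 0; n < nodes; n++)
        {
            size_t  off = (size_t) n * chunk;
            size_t  len = (n == nodes - 1) ? (size - off) : chunk;

            if (off >= size)
                break;

            /* sets the policy only; pages move to the node on first touch */
            numa_tonode_memory(base + off, len, n);
        }

        memset(base, 0, size);  /* fault the pages in, honoring the policy */
        printf("distributed %zu bytes over %d node(s)\n", size, nodes);
        return 0;
    }

As in the patch, keeping the chunks page-aligned guarantees that no chunk (and hence no buffer stored inside one) straddles two nodes, and the node owning a given offset can be computed with a single division.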