From f832fedb847004990e7e806691ab4338752a5db5 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 31 Jul 2025 19:50:05 +0200 Subject: [PATCH v20250807 08/11] NUMA: clocksweep allocation balancing If backends only allocate buffers from the "local" partition, this could cause significant imbalance - some partitions might be overused, while other partitions would be left unused. In other words, shared buffers would not be used efficiently. We want all partitions to be used about the same, i.e. serve about the same number of allocations. To achieve that, allocations from partitions that are "too busy" may get redirected to other partitions. The system counts allocations requested from each partition, calculates the "fair share" (average per partition), and then redirects excess allocations to other partitions. Each partition gets a set of coefficients determining the fraction of allocations to redirect to other partitions. The coefficients may be interpreted as a "budget" for each partition, i.e. the number of allocations to serve from that partition, before moving to the next partition (in a round-robin manner). All of this is tied to the partition where the allocation was requested. Each partition has a separate set of coefficients. We might also treat the coefficients as probabilities, and use a PRNG to determine where to direct individual requests. But a PRNG seems fairly expensive, and the budget approach works well. We intentionally keep the "budget" fairly low, with the sum for a given partition being 100. That means we get to the same partition after only 100 allocations, keeping it more balanced. It wouldn't be hard to make the budgets higher (e.g. matching the number of allocations per round), but it might also make the behavior less smooth (long periods of allocations from each partition). This is very simple/cheap, and over many allocations it has the same effect. For periods of low activity it may diverge, but that does not matter much (we care about high-activity periods much more). --- .../pg_buffercache--1.6--1.7.sql | 5 +- contrib/pg_buffercache/pg_buffercache_pages.c | 41 +- src/backend/storage/buffer/bufmgr.c | 3 + src/backend/storage/buffer/freelist.c | 396 ++++++++++++++++-- src/include/storage/buf_internals.h | 1 + src/include/storage/bufmgr.h | 5 +- 6 files changed, 420 insertions(+), 31 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql index 999bb2128f0..5acae31b836 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -27,7 +27,10 @@ CREATE VIEW pg_buffercache_partitions AS num_passes bigint, -- clocksweep passes next_buffer integer, -- next victim buffer for clocksweep total_allocs bigint, -- handled allocs (running total) - num_allocs bigint); -- handled allocs (current cycle) + num_allocs bigint, -- handled allocs (current cycle) + total_req_allocs bigint, -- requested allocs (running total) + num_req_allocs bigint, -- requested allocs (current cycle) + weights int[]); -- balancing weights -- Don't want these to be available to public.
REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 7ca075e6164..13014549d00 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -15,6 +15,8 @@ #include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "utils/array.h" +#include "utils/builtins.h" #include "utils/rel.h" @@ -27,7 +29,7 @@ #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 #define NUM_BUFFERCACHE_NUMA_ELEM 3 -#define NUM_BUFFERCACHE_PARTITIONS_ELEM 12 +#define NUM_BUFFERCACHE_PARTITIONS_ELEM 15 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -789,6 +791,8 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { + TypeCacheEntry *typentry = lookup_type_cache(INT4OID, 0); + funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ @@ -826,6 +830,12 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) INT8OID, -1, 0); TupleDescInitEntry(tupledesc, (AttrNumber) 12, "num_allocs", INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 13, "total_req_allocs", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 14, "num_req_allocs", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 15, "weights", + typentry->typarray, -1, 0); funcctx->user_fctx = BlessTupleDesc(tupledesc); @@ -851,11 +861,17 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) buffers_remain, buffers_free; - uint64 buffer_total_allocs; + uint64 buffer_total_allocs, + buffer_total_req_allocs; uint32 complete_passes, next_victim_buffer, - buffer_allocs; + buffer_allocs, + buffer_req_allocs; + + int *weights; + Datum *dweights; + ArrayType *array; Datum values[NUM_BUFFERCACHE_PARTITIONS_ELEM]; bool nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM]; @@ -866,7 +882,15 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) FreelistPartitionGetInfo(i, &buffers_consumed, &buffers_remain, &buffers_free, &complete_passes, &next_victim_buffer, - &buffer_total_allocs, &buffer_allocs); + &buffer_total_allocs, &buffer_allocs, + &buffer_total_req_allocs, &buffer_req_allocs, + &weights); + + dweights = palloc_array(Datum, funcctx->max_calls); + for (int j = 0; j < funcctx->max_calls; j++) + dweights[j] = Int32GetDatum(weights[j]); + + array = construct_array_builtin(dweights, funcctx->max_calls, INT4OID); values[0] = Int32GetDatum(i); nulls[0] = false; @@ -904,6 +928,15 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) values[11] = Int64GetDatum(buffer_allocs); nulls[11] = false; + values[12] = Int64GetDatum(buffer_total_req_allocs); + nulls[12] = false; + + values[13] = Int64GetDatum(buffer_req_allocs); + nulls[13] = false; + + values[14] = PointerGetDatum(array); + nulls[14] = false; + /* Build and return the tuple.
*/ tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); result = HeapTupleGetDatum(tuple); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7a8c45ac59c..97b6f973c26 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3623,6 +3623,9 @@ BgBufferSync(WritebackContext *wb_context) /* assume we can hibernate, any partition can set to false */ bool hibernate = true; + /* trigger partition rebalancing first */ + StrategySyncBalance(); + /* get the number of clocksweep partitions, and total alloc count */ StrategySyncPrepare(&num_partitions, &recent_alloc); diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 17988b4fd53..f2203cebcc8 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -51,6 +51,22 @@ typedef struct BufferStrategyFreelist uint64 consumed; } BufferStrategyFreelist; +/* + * XXX needed to make ClockSweep fixed-size, should be tied to the number + * of buffer partitions. + */ +#define MAX_BUFFER_PARTITIONS 16 + /* + * Coefficient used to combine the old and new balance coefficients, using + * a weighted average. The higher the value, the more the old value affects the + * result. + * + * XXX Doesn't this invalidate the interpretation as a probability to allocate + * from a given partition? Does it still sum to 100%? + */ +#define CLOCKSWEEP_HISTORY_COEFF 0.5 + /* * Information about one partition of the ClockSweep (on a subset of buffers). * @@ -83,9 +99,28 @@ typedef struct uint32 completePasses; /* Complete cycles of the clock sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ + /* + * Buffers that should have been allocated in this partition (but might + * have been redirected to keep allocations balanced). + */ + pg_atomic_uint32 numRequestedAllocs; + /* running total of allocs */ pg_atomic_uint64 numTotalAllocs; + pg_atomic_uint64 numTotalRequestedAllocs; + /* + * Weights to balance buffer allocations for all the partitions. Each + * partition gets a vector of weights 0-100, determining what fraction + * of buffers to allocate from that particular partition. So [75, 15, 5, 5] + * would mean 75% of allocations should come from partition 0, 15% from + * partition 1, and 5% from partitions 2&3. Each partition gets a different + * vector of weights. + * + * XXX Allocate a fixed-length array, to simplify working with arrays of + * the structs, etc. + */ + uint8 balance[MAX_BUFFER_PARTITIONS]; } ClockSweep; /* @@ -153,7 +188,33 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state); static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); -static ClockSweep *ChooseClockSweep(void); +static ClockSweep *ChooseClockSweep(bool balance); + +/* + * clocksweep allocation balancing + * + * To balance allocations from clocksweep partitions, each partition gets + * a set of "weights" determining the fraction of allocations to redirect + * to other partitions. + * + * We could do that based on a random number generator, but that seems too + * expensive. So instead we simply treat the probabilities as a budget, i.e. + * a number of allocations to serve from that partition, before moving to + * the next partition (in a round-robin manner). + * + * This is very simple/cheap, and over many allocations it has the same + * effect.
For periods of low activity it may diverge, but that does not + * matter much (we care about high-activity periods much more). + * + * We intentionally keep the "budget" fairly low, with the sum for a given + * partition being 100. That means we get to the same partition after only 100 + * allocations, keeping it more balanced. It wouldn't be hard to make the + * budgets higher (say, to match the expected number of allocations, i.e. + * about the average number of allocations from the past interval). + */ +static int clocksweep_partition_optimal = -1; +static int clocksweep_partition = -1; +static int clocksweep_count = 0; /* * ClockSweepTick - Helper routine for StrategyGetBuffer() @@ -165,7 +226,7 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; - ClockSweep *sweep = ChooseClockSweep(); + ClockSweep *sweep = ChooseClockSweep(true); /* * Atomically move hand ahead one buffer - if there's several processes @@ -294,32 +355,72 @@ calculate_partition_index() * and that's cheaper. But how would that deal with odd number of nodes? */ static ClockSweep * -ChooseClockSweep(void) +ChooseClockSweep(bool balance) { - int index = calculate_partition_index(); + /* What's the "optimal" partition? */ + int index = calculate_partition_index(); + ClockSweep *sweep = &StrategyControl->sweeps[index]; + + /* + * Did we migrate to a different core / NUMA node, affecting the + * clocksweep partition we should use? Switch to that partition. + */ + if (clocksweep_partition_optimal != index) + { + clocksweep_partition_optimal = index; + clocksweep_partition = index; + clocksweep_count = sweep->balance[index]; + } + + /* we should have a valid partition */ + Assert(clocksweep_partition_optimal != -1); + Assert(clocksweep_partition != -1); + + /* + * If rebalancing is enabled, use the weights to redirect the allocations + * to match the desired distribution. We do that by using the partitions + * in a round-robin way, after allocating the "weight" of allocations + * from each partition. + */ + if (balance) + { + /* + * Ran out of allocations from the current partition? Move to the + * next partition with non-zero weight, and use the weight as a + * budget for allocations. + */ + while (clocksweep_count == 0) + { + clocksweep_partition + = (clocksweep_partition + 1) % StrategyControl->num_partitions; + + Assert((clocksweep_partition >= 0) && + (clocksweep_partition < StrategyControl->num_partitions)); + + clocksweep_count = sweep->balance[clocksweep_partition]; + } + + /* account for the allocation - take it from the budget */ + --clocksweep_count; - return &StrategyControl->sweeps[index]; + /* account for the alloc in the "optimal" (original) partition */ + pg_atomic_fetch_add_u32(&sweep->numRequestedAllocs, 1); + } + + return &StrategyControl->sweeps[clocksweep_partition]; } /* * ChooseFreeList - * Pick the buffer freelist to use, depending on the CPU and NUMA node. - * - * Without partitioned freelists (numa_partition_freelist=false), there's only - * a single freelist, so use that. - * - * With partitioned freelists, we have multiple ways how to pick the freelist - * for the backend: - * - * - one freelist per CPU, use the freelist for CPU the task executes on - * - * - one freelist per NUMA node, use the freelist for node task executes on + * pick a clocksweep partition based on NUMA node and CPU * - * - use fixed number of freelists, map processes to lists based on PID + * The number of freelist partitions may not match the number of NUMA + * nodes, but it should not be lower.
Each partition should be mapped to + * a single NUMA node, but a node may have multiple partitions. If there + * are multiple partitions per node (all nodes have the same number of + * partitions), we pick the partition using CPU. * - * There may be some other strategies, not sure. The important thing is this - * needs to be refrecled during initialization, i.e. we need to create the - * right number of lists. + * XXX Maybe this should use the same balancing strategy as clocksweep? */ static BufferStrategyFreelist * ChooseFreeList(void) @@ -417,7 +518,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * the rate of buffer consumption. Note that buffers recycled by a * strategy object are intentionally not counted here. */ - pg_atomic_fetch_add_u32(&ChooseClockSweep()->numBufferAllocs, 1); + pg_atomic_fetch_add_u32(&ChooseClockSweep(false)->numBufferAllocs, 1); /* * First check, without acquiring the lock, whether there's buffers in the @@ -580,6 +681,224 @@ StrategyFreeBuffer(BufferDesc *buf) SpinLockRelease(&freelist->freelist_lock); } +/* + * StrategySyncBalance + * update partition weights, to balance the buffer allocations + * + * We want to give preference to allocating buffers on the same NUMA node, + * but that might lead to imbalance - a single process would only use a + * fraction of shared buffers. We don't want that, we want to utilize the + * whole shared buffers. The number of allocations in each partition may + * also change over time, so we need to adapt to that. + * + * To allow this "adaptive balancing", each partition has a set of weights, + * determining what fraction of allocations to direct to other partitions. + * For simplicity the coefficients are integers 0-100, expressing the + * percentage of allocations redirected to that partition. + * + * Consider for example weights [50, 25, 25, 0] for one of 4 partitions. + * This means 50% of allocations will be redirected to partition 0, 25% + * to partitions 1 and 2, and no allocations will go to partition 3. + * + * To calculate these weights, assume we know the number of allocations + * requested for each partition in the past interval. We can use this to + * calculate weights for the following interval, aiming to allocate the + * same (fair share) number of buffers from each partition. + * + * Note: This is based on number of allocations "originating" in a given + * partition. If an allocation is requested in a partition A, it's counted + * as allocation for A, even if it gets redirected to some other partition. + * The patch addes a new counter to track this. + * + * The main observation is that partitions get divided into two groups, + * depending on whether the number allocations is higher or lower than the + * target average. But the "total delta" for these two groups is the + * same, i.e. sum(abs(allocs - avg_allocs)) is the same. Therefore, the + * task is to "distribute" the excess allocations between the partitions + * with not enough allocations. + * + * Partitions with (nallocs < avg_nallocs) don't redirect any allocations. + * + * Partitions with (nallocs > avg_nallocs) redirect the extra allocations, + * with each target allocation getting a proportional part (with respect + * to the total delta). + * + * XXX In principle we might do without the new "requestedAllocs" counter, + * but we'd need to solve the matrix equation Ax=b, with [A,b] known + * (weights and allocs), and calculate x (requested allocs). But it's not + * quite clear this'd always have a solution. 
+ */ +void +StrategySyncBalance(void) +{ + /* snapshot of allocs for partitions */ + uint32 allocs[MAX_BUFFER_PARTITIONS]; + + uint32 total_allocs = 0, /* total number of allocations */ + avg_allocs, /* average allocations (per partition) */ + delta_allocs = 0; /* sum of allocs above average */ + + /* + * Collect the number of allocations requested in the past interval. + * While at it, reset the counter to start the new interval. + * + * We lock the partitions one by one, so this is not exactly a consistent + * snapshot of the counts, and the resets happen before we update the + * weights too. But we're only looking for heuristics anyway, so this + * should be good enough. + * + * A similar issue applies to the counter reset - we haven't updated + * the weights yet. Should be fine, we'll simply consider this in the + * next balancing cycle. + * + * XXX Does this need to worry about the completePasses too? + */ + for (int i = 0; i < StrategyControl->num_partitions; i++) + { + ClockSweep *sweep = &StrategyControl->sweeps[i]; + + /* no need for a spinlock */ + allocs[i] = pg_atomic_exchange_u32(&sweep->numRequestedAllocs, 0); + + /* add the allocs to running total */ + pg_atomic_fetch_add_u64(&sweep->numTotalRequestedAllocs, allocs[i]); + + total_allocs += allocs[i]; + } + + /* + * Calculate the "fair share" of allocations per partition. + * + * XXX The last partition could be smaller, in which case it should be + * expected to handle fewer allocations. So this should be a weighted + * average. But for now a simple average is good enough. + */ + avg_allocs = (total_allocs / StrategyControl->num_partitions); + + /* + * Calculate the "delta" from balanced state, i.e. how many allocations + * we'd need to redistribute. + */ + for (int i = 0; i < StrategyControl->num_partitions; i++) + { + if (allocs[i] > avg_allocs) + delta_allocs += (allocs[i] - avg_allocs); + } + + /* + * Skip the rebalancing when there's not enough activity. In this case + * we just keep the current weights. + * + * XXX The threshold of 100 allocations is pretty arbitrary. + * + * XXX Maybe a better strategy would be to slowly return to the default + * weights, with each partition allocating only from itself? + * + * XXX Maybe we shouldn't even reset the counters in this case? But it + * should not matter if the activity is low. + */ + if (avg_allocs < 100) + { + elog(LOG, "rebalance skipped: not enough allocations (allocs: %u)", + avg_allocs); + return; + } + + /* + * Likewise, skip rebalancing if the imbalance is not significant. We + * consider it acceptable if the amount of allocations we'd need to + * redistribute is less than 10% of the average. + * + * XXX Again, these thresholds are rather arbitrary. + */ + if (delta_allocs < (avg_allocs * 0.1)) + { + elog(LOG, "rebalance skipped: delta within limit (delta: %u, threshold: %u)", + delta_allocs, (uint32) (avg_allocs * 0.1)); + return; + } + + /* + * Got to do the rebalancing. Go through the partitions, and for each + * partition decide if it gets to redirect or receive allocations. + * + * If a partition has fewer than average allocations, it won't redirect + * any allocations to other partitions. So it only has a single non-zero + * weight, and that's for itself. + * + * If a partition has more than average allocations, it won't receive + * any redirected allocations. Instead, the excess allocations are + * redirected to the other partitions.
+ * + * The redistribution is "proportional" - if the excess allocations of + * a partition represent 10% of the "delta", then each partition that + * needs more allocations will get 10% of the gap from this one. + * + * XXX We should add hysteresis, to "dampen" the changes, and make + * sure it does not oscillate too much. + * + * XXX Ideally, the alternative partitions to use first would be the + * other partitions for the same node (if any). + */ + for (int i = 0; i < StrategyControl->num_partitions; i++) + { + ClockSweep *sweep = &StrategyControl->sweeps[i]; + uint8 balance[MAX_BUFFER_PARTITIONS]; + + /* lock, we're going to modify the balance weights */ + SpinLockAcquire(&sweep->clock_sweep_lock); + + /* reset the weights to start from scratch */ + memset(balance, 0, sizeof(uint8) * MAX_BUFFER_PARTITIONS); + + /* does this partition have fewer or more than avg_allocs? */ + if (allocs[i] < avg_allocs) + { + /* fewer - don't redirect any allocations elsewhere */ + balance[i] = 100; + } + else + { + /* + * more - redistribute the excess allocations + * + * Each "target" partition (with less than avg_allocs) should get + * a fraction proportional to (excess/delta) from this one. + */ + + /* fraction of the "total" delta */ + double delta_frac = (allocs[i] - avg_allocs) * 1.0 / delta_allocs; + + /* keep just enough allocations to meet the target */ + balance[i] = (100.0 * avg_allocs / allocs[i]); + + /* redirect the extra allocations */ + for (int j = 0; j < StrategyControl->num_partitions; j++) + { + /* ignore partitions that don't need additional allocations */ + if (allocs[j] >= avg_allocs) + continue; + + /* How many allocations to receive from i-th partition? */ + uint32 receive_allocs = delta_frac * (avg_allocs - allocs[j]); + + /* fraction to redirect */ + balance[j] = (100.0 * receive_allocs / allocs[i]) + 0.5; + } + } + + /* combine the old and new weights (hysteresis) */ + for (int j = 0; j < MAX_BUFFER_PARTITIONS; j++) + { + sweep->balance[j] + = CLOCKSWEEP_HISTORY_COEFF * sweep->balance[j] + + (1.0 - CLOCKSWEEP_HISTORY_COEFF) * balance[j]; + } + + SpinLockRelease(&sweep->clock_sweep_lock); + } +} + /* * StrategySyncStart -- prepare for sync of all partitions * @@ -606,6 +925,7 @@ StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc) { ClockSweep *sweep = &StrategyControl->sweeps[i]; + /* XXX we don't need the spinlock to read atomics, no? */ SpinLockAcquire(&sweep->clock_sweep_lock); if (num_buf_alloc) { @@ -852,7 +1172,21 @@ StrategyInitialize(bool init) /* Clear statistics */ StrategyControl->sweeps[i].completePasses = 0; pg_atomic_init_u32(&StrategyControl->sweeps[i].numBufferAllocs, 0); + pg_atomic_init_u32(&StrategyControl->sweeps[i].numRequestedAllocs, 0); pg_atomic_init_u64(&StrategyControl->sweeps[i].numTotalAllocs, 0); + pg_atomic_init_u64(&StrategyControl->sweeps[i].numTotalRequestedAllocs, 0); + + /* + * Initialize the weights - start by allocating 100% of buffers from + * the current node / partition.
+ */ + for (int j = 0; j < MAX_BUFFER_PARTITIONS; j++) + { + if (i == j) + StrategyControl->sweeps[i].balance[i] = 100; + else + StrategyControl->sweeps[i].balance[j] = 0; + } } /* No pending notification */ @@ -1242,7 +1576,9 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r void FreelistPartitionGetInfo(int idx, uint64 *consumed, uint64 *remain, uint64 *actually_free, uint32 *complete_passes, uint32 *next_victim_buffer, - uint64 *buffer_total_allocs, uint32 *buffer_allocs) + uint64 *buffer_total_allocs, uint32 *buffer_allocs, + uint64 *buffer_total_req_allocs, uint32 *buffer_req_allocs, + int **weights) { BufferStrategyFreelist *freelist; ClockSweep *sweep; @@ -1288,11 +1624,21 @@ FreelistPartitionGetInfo(int idx, uint64 *consumed, uint64 *remain, uint64 *actu /* get the clocksweep stats too */ *complete_passes = sweep->completePasses; + + /* calculate the actual buffer ID */ *next_victim_buffer = pg_atomic_read_u32(&sweep->nextVictimBuffer); + *next_victim_buffer = sweep->firstBuffer + (*next_victim_buffer % sweep->numBuffers); - *buffer_allocs = pg_atomic_read_u32(&sweep->numBufferAllocs); *buffer_total_allocs = pg_atomic_read_u64(&sweep->numTotalAllocs); + *buffer_allocs = pg_atomic_read_u32(&sweep->numBufferAllocs); - /* calculate the actual buffer ID */ - *next_victim_buffer = sweep->firstBuffer + (*next_victim_buffer % sweep->numBuffers); + *buffer_total_req_allocs = pg_atomic_read_u64(&sweep->numTotalRequestedAllocs); + *buffer_req_allocs = pg_atomic_read_u32(&sweep->numRequestedAllocs); + + /* return the weights in a newly allocated array */ + *weights = palloc_array(int, StrategyControl->num_partitions); + for (int i = 0; i < StrategyControl->num_partitions; i++) + { + (*weights)[i] = (int) sweep->balance[i]; + } } diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 907b160b4f7..38bd5511048 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -449,6 +449,7 @@ extern void StrategyFreeBuffer(BufferDesc *buf); extern bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring); +extern void StrategySyncBalance(void); extern void StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc); extern int StrategySyncStart(int partition, uint32 *complete_passes, int *first_buffer, int *num_buffers); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index a6795c5fee9..e1729f0ee14 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -353,7 +353,10 @@ extern void FreelistPartitionGetInfo(int idx, uint32 *complete_passes, uint32 *next_victim_buffer, uint64 *buffer_total_allocs, - uint32 *buffer_allocs); + uint32 *buffer_allocs, + uint64 *buffer_total_req_allocs, + uint32 *buffer_req_allocs, + int **weights); /* inline functions */ -- 2.50.1
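To illustrate the budget-based redirection implemented in ChooseClockSweep above, here is a minimal standalone sketch (not part of the patch). The partition count and the weight vector are made-up example values; the point is only that treating the weights as round-robin budgets reproduces the desired allocation distribution over many allocations without a PRNG.

```c
/*
 * Standalone sketch of the budget-based redirection: the weights of the
 * "optimal" partition act as allocation budgets, consumed round-robin.
 * NUM_PARTS and the weights are example values, not taken from the patch.
 */
#include <stdio.h>

#define NUM_PARTS 4

static const int weights[NUM_PARTS] = {50, 30, 10, 10};	/* sums to 100 */

static int cur_part = 0;	/* partition currently being drawn from */
static int budget = 50;		/* remaining budget of cur_part (= weights[0]) */

/* return the partition that should serve the next allocation */
static int
next_partition(void)
{
	/* budget exhausted? move to the next partition with a non-zero weight */
	while (budget == 0)
	{
		cur_part = (cur_part + 1) % NUM_PARTS;
		budget = weights[cur_part];
	}
	budget--;
	return cur_part;
}

int
main(void)
{
	int			served[NUM_PARTS] = {0};

	/* over 1000 allocations the served counts match the weights (x10) */
	for (int i = 0; i < 1000; i++)
		served[next_partition()]++;

	for (int i = 0; i < NUM_PARTS; i++)
		printf("partition %d: %d allocations\n", i, served[i]);

	return 0;
}
```

With the example weights the output is 500/300/100/100, i.e. the budgets converge to the same split a probabilistic scheme would give, while only switching partitions every few dozen allocations.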
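Similarly, the weight recalculation in StrategySyncBalance can be followed on a small worked example. The sketch below (again not part of the patch, with a made-up snapshot of requested allocations and the hysteresis step omitted) mirrors the proportional redistribution: partitions below the fair share keep 100% of their own allocations, and partitions above it keep just enough to meet the fair share and redirect the rest in proportion to each receiver's shortfall.

```c
/*
 * Standalone sketch of the StrategySyncBalance weight calculation on an
 * example snapshot. The hysteresis (mixing with previous weights) is left
 * out so the resulting weights are easy to verify by hand.
 */
#include <stdio.h>

#define NUM_PARTS 4

int
main(void)
{
	/* requested allocations per partition in the last interval (example) */
	unsigned	allocs[NUM_PARTS] = {500, 100, 200, 200};
	unsigned	total = 0,
				avg,
				delta = 0;
	int			balance[NUM_PARTS][NUM_PARTS] = {{0}};

	for (int i = 0; i < NUM_PARTS; i++)
		total += allocs[i];

	avg = total / NUM_PARTS;	/* fair share: 250 */

	for (int i = 0; i < NUM_PARTS; i++)
		if (allocs[i] > avg)
			delta += allocs[i] - avg;	/* excess to redistribute: 250 */

	for (int i = 0; i < NUM_PARTS; i++)
	{
		if (allocs[i] <= avg)
		{
			/* under-used partitions keep all of their own allocations */
			balance[i][i] = 100;
			continue;
		}

		/* this partition's share of the total excess */
		double		delta_frac = (double) (allocs[i] - avg) / delta;

		/* keep just enough to meet the fair share */
		balance[i][i] = (int) (100.0 * avg / allocs[i]);

		/* redirect the rest, proportionally to each partition's shortfall */
		for (int j = 0; j < NUM_PARTS; j++)
		{
			if (allocs[j] >= avg)
				continue;
			balance[i][j] =
				(int) (100.0 * delta_frac * (avg - allocs[j]) / allocs[i] + 0.5);
		}
	}

	/* e.g. partition 0 (500 requests, fair share 250) ends up [50, 30, 10, 10] */
	for (int i = 0; i < NUM_PARTS; i++)
	{
		printf("partition %d:", i);
		for (int j = 0; j < NUM_PARTS; j++)
			printf(" %3d", balance[i][j]);
		printf("\n");
	}
	return 0;
}
```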