From d2c7801011a394b5b9e00d60bf59e759b65e1bed Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Thu, 22 May 2025 18:39:08 +0200
Subject: [PATCH v20250807 10/11] NUMA: interleave PGPROC entries

The goal is to distribute ProcArray (or rather PGPROC entries and
associated fast-path arrays) to NUMA nodes.

We can't do this by simply interleaving pages, because that wouldn't
work for both parts at the same time. We want to place the PGPROC and
its fast-path locking structs on the same node, but the structs are
of different sizes, etc.

Another problem is that PGPROC entries are fairly small, so with huge
pages and reasonable values of max_connections everything fits onto a
single page. We don't want to make this incompatible with huge pages.

Note: If we eventually switch to allocating separate shared segments for
different parts (to allow on-line resizing), we could keep using regular
pages for procarray, and this would not be such an issue.

To make this work, we split the PGPROC array into per-node segments,
each with about (MaxBackends / numa_nodes) entries, and one segment for
auxiliary processes and prepared transactions. And we do the same thing
for fast-path arrays.

The PGPROC segments are laid out like this (e.g. for 2 NUMA nodes):

 - PGPROC array / node #0
 - PGPROC array / node #1
 - PGPROC array / aux processes + 2PC transactions
 - fast-path arrays / node #0
 - fast-path arrays / node #1
 - fast-path arrays / aux processes + 2PC transactions

Each segment is aligned to (starts at) a memory page boundary, and so
effectively occupies a whole number of memory pages.

Having a single PGPROC array made certain operations easier - e.g. it
was possible to iterate the array, and GetNumberFromPGProc() could
calculate offset by simply subtracting PGPROC pointers. With multiple
segments that's not possible, but the fallout is minimal.

Most places accessed PGPROC through PROC_HDR->allProcs, and can continue
to do so, except that now they get a pointer to the PGPROC (which most
places wanted anyway).

With the feature disabled, there's only a single "partition" for all
PGPROC entries.

Similarly to the buffer partitioning, this introduces a small "registry"
of partitions, as a source of truth. And then also a new "system" view
"pg_buffercache_pgproc" showing basic information about the partitions.

Note: There's an indirection, though. But the pointer does not change,
so hopefully that's not an issue. And each PGPROC entry gets an explicit
procnumber field, which is the index in allProcs, so GetNumberFromPGProc
can simply return that.

Each PGPROC also gets numa_node, tracking the NUMA node, so that we
don't have to recalculate that. This is used by InitProcess() to pick
a PGPROC entry from the local NUMA node.

Note: The scheduler may migrate the process to a different CPU/node
later. Maybe we should consider pinning the process to the node?
---
 .../pg_buffercache--1.6--1.7.sql              |  19 +
 contrib/pg_buffercache/pg_buffercache_pages.c |  94 +++
 src/backend/access/transam/clog.c             |   4 +-
 src/backend/postmaster/pgarch.c               |   2 +-
 src/backend/postmaster/walsummarizer.c        |   2 +-
 src/backend/storage/buffer/buf_init.c         |   2 -
 src/backend/storage/buffer/freelist.c         |   2 +-
 src/backend/storage/ipc/procarray.c           |  63 +-
 src/backend/storage/lmgr/lock.c               |   6 +-
 src/backend/storage/lmgr/proc.c               | 565 +++++++++++++++++-
 src/backend/utils/init/globals.c              |   1 +
 src/backend/utils/misc/guc_tables.c           |  10 +
 src/include/miscadmin.h                       |   1 +
 src/include/storage/proc.h                    |  14 +-
 src/tools/pgindent/typedefs.list              |   1 +
 15 files changed, 722 insertions(+), 64 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
index 5acae31b836..ba54f69eeb4 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql
@@ -38,3 +38,22 @@ REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC;
 
 GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor;
 GRANT SELECT ON pg_buffercache_partitions TO pg_monitor;
+
+-- Register the new functions.
+CREATE OR REPLACE FUNCTION pg_buffercache_pgproc()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pgproc'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_pgproc AS
+	SELECT P.* FROM pg_buffercache_pgproc() AS P
+	(partition integer,
+	 numa_node integer, num_procs integer, pgproc_ptr bigint, fastpath_ptr bigint);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_pgproc() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_pgproc FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_pgproc() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_pgproc TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 13014549d00..ee3aa8be2ce 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -15,6 +15,7 @@
 #include "port/pg_numa.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/proc.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
@@ -30,6 +31,7 @@
 
 #define NUM_BUFFERCACHE_NUMA_ELEM	3
 #define NUM_BUFFERCACHE_PARTITIONS_ELEM	15
+#define NUM_BUFFERCACHE_PGPROC_ELEM	5
 
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_buffercache",
@@ -104,6 +106,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
 PG_FUNCTION_INFO_V1(pg_buffercache_partitions);
+PG_FUNCTION_INFO_V1(pg_buffercache_pgproc);
 
 
 /* Only need to touch memory once per backend process lifetime */
@@ -946,3 +949,94 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 	else
 		SRF_RETURN_DONE(funcctx);
 }
+
+/*
+ * Inquire about partitioning of PGPROC array.
+ *
+ * Set-returning function producing one row per PGPROC partition: partition
+ * index, NUMA node, number of entries, and PGPROC / fast-path addresses.
+ */
+Datum
+pg_buffercache_pgproc(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	MemoryContext oldcontext;
+	TupleDesc	tupledesc;
+	TupleDesc	expected_tupledesc;
+	HeapTuple	tuple;
+	Datum		result;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* Switch context when allocating stuff to be used in later calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (expected_tupledesc->natts != NUM_BUFFERCACHE_PGPROC_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Construct a tuple descriptor for the result rows. */
+		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "numa_node",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "num_procs",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "pgproc_ptr",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "fastpath_ptr",
+						   INT8OID, -1, 0);
+
+		funcctx->user_fctx = BlessTupleDesc(tupledesc);
+
+		/* Return to original context when allocating transient memory */
+		MemoryContextSwitchTo(oldcontext);
+
+		/* One result row per PGPROC partition. */
+		funcctx->max_calls = ProcPartitionCount();
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		uint32		i = funcctx->call_cntr;
+		int			numa_node,
+					num_procs;
+		void	   *pgproc_ptr,
+				   *fastpath_ptr;
+		Datum		values[NUM_BUFFERCACHE_PGPROC_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_PGPROC_ELEM];
+
+		ProcPartitionGet(i, &numa_node, &num_procs,
+						 &pgproc_ptr, &fastpath_ptr);
+
+		values[0] = Int32GetDatum(i);
+		nulls[0] = false;
+
+		values[1] = Int32GetDatum(numa_node);
+		nulls[1] = false;
+
+		values[2] = Int32GetDatum(num_procs);
+		nulls[2] = false;
+		/* Addresses go into int8 columns, so convert via Int64GetDatum. */
+		values[3] = Int64GetDatum((int64) (uintptr_t) pgproc_ptr);
+		nulls[3] = false;
+
+		values[4] = Int64GetDatum((int64) (uintptr_t) fastpath_ptr);
+		nulls[4] = false;
+
+		/* Build and return the tuple. */
+		tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls);
+		result = HeapTupleGetDatum(tuple);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index e80fbe109cf..928d126d0ee 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -574,7 +574,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
 	/* Walk the list and update the status of all XIDs. */
 	while (nextidx != INVALID_PROC_NUMBER)
 	{
-		PGPROC	   *nextproc = &ProcGlobal->allProcs[nextidx];
+		PGPROC	   *nextproc = ProcGlobal->allProcs[nextidx];
 		int64		thispageno = nextproc->clogGroupMemberPage;
 
 		/*
@@ -633,7 +633,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
 	 */
 	while (wakeidx != INVALID_PROC_NUMBER)
 	{
-		PGPROC	   *wakeproc = &ProcGlobal->allProcs[wakeidx];
+		PGPROC	   *wakeproc = ProcGlobal->allProcs[wakeidx];
 
 		wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
 		pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER);
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 78e39e5f866..e28e0f7d3bd 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -289,7 +289,7 @@ PgArchWakeup(void)
 	 * be relaunched shortly and will start archiving.
 	 */
 	if (arch_pgprocno != INVALID_PROC_NUMBER)
-		SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch);
+		SetLatch(&ProcGlobal->allProcs[arch_pgprocno]->procLatch);
 }
 
 
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index 777c9a8d555..087279a6a8e 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -649,7 +649,7 @@ WakeupWalSummarizer(void)
 	LWLockRelease(WALSummarizerLock);
 
 	if (pgprocno != INVALID_PROC_NUMBER)
-		SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch);
+		SetLatch(&ProcGlobal->allProcs[pgprocno]->procLatch);
 }
 
 /*
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 5b65a855b29..fb52039e1a6 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -500,8 +500,6 @@ buffer_partitions_prepare(void)
 	if (numa_nodes < 1)
 		numa_nodes = 1;
 
-	elog(WARNING, "IsUnderPostmaster %d", IsUnderPostmaster);
-
 	/*
 	 * XXX A bit weird. Do we need to worry about postmaster? Could this even
 	 * run outside postmaster? I don't think so.
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index bbe29bc9729..878d1e33f61 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -510,7 +510,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 		 * actually fine because procLatch isn't ever freed, so we just can
 		 * potentially set the wrong process' (or no process') latch.
 		 */
-		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+		SetLatch(&ProcGlobal->allProcs[bgwprocno]->procLatch);
 	}
 
 	/*
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index bf987aed8d3..3e86e4ca2ae 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -268,7 +268,7 @@ typedef enum KAXCompressReason
 
 static ProcArrayStruct *procArray;
 
-static PGPROC *allProcs;
+static PGPROC **allProcs;
 
 /*
  * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress()
@@ -502,7 +502,7 @@ ProcArrayAdd(PGPROC *proc)
 		int			this_procno = arrayP->pgprocnos[index];
 
 		Assert(this_procno >= 0 && this_procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
-		Assert(allProcs[this_procno].pgxactoff == index);
+		Assert(allProcs[this_procno]->pgxactoff == index);
 
 		/* If we have found our right position in the array, break */
 		if (this_procno > pgprocno)
@@ -538,9 +538,9 @@ ProcArrayAdd(PGPROC *proc)
 		int			procno = arrayP->pgprocnos[index];
 
 		Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
-		Assert(allProcs[procno].pgxactoff == index - 1);
+		Assert(allProcs[procno]->pgxactoff == index - 1);
 
-		allProcs[procno].pgxactoff = index;
+		allProcs[procno]->pgxactoff = index;
 	}
 
 	/*
@@ -581,7 +581,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 	myoff = proc->pgxactoff;
 
 	Assert(myoff >= 0 && myoff < arrayP->numProcs);
-	Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff);
+	Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]]->pgxactoff == myoff);
 
 	if (TransactionIdIsValid(latestXid))
 	{
@@ -636,9 +636,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		int			procno = arrayP->pgprocnos[index];
 
 		Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
-		Assert(allProcs[procno].pgxactoff - 1 == index);
+		Assert(allProcs[procno]->pgxactoff - 1 == index);
 
-		allProcs[procno].pgxactoff = index;
+		allProcs[procno]->pgxactoff = index;
 	}
 
 	/*
@@ -860,7 +860,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
 	/* Walk the list and clear all XIDs. */
 	while (nextidx != INVALID_PROC_NUMBER)
 	{
-		PGPROC	   *nextproc = &allProcs[nextidx];
+		PGPROC	   *nextproc = allProcs[nextidx];
 
 		ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid);
 
@@ -880,7 +880,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
 	 */
 	while (wakeidx != INVALID_PROC_NUMBER)
 	{
-		PGPROC	   *nextproc = &allProcs[wakeidx];
+		PGPROC	   *nextproc = allProcs[wakeidx];
 
 		wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
 		pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PROC_NUMBER);
@@ -1526,7 +1526,7 @@ TransactionIdIsInProgress(TransactionId xid)
 		pxids = other_subxidstates[pgxactoff].count;
 		pg_read_barrier();		/* pairs with barrier in GetNewTransactionId() */
 		pgprocno = arrayP->pgprocnos[pgxactoff];
-		proc = &allProcs[pgprocno];
+		proc = allProcs[pgprocno];
 		for (j = pxids - 1; j >= 0; j--)
 		{
 			/* Fetch xid just once - see GetNewTransactionId */
@@ -1622,7 +1622,6 @@ TransactionIdIsInProgress(TransactionId xid)
 	return false;
 }
 
-
 /*
  * Determine XID horizons.
  *
@@ -1740,7 +1739,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	for (int index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 		int8		statusFlags = ProcGlobal->statusFlags[index];
 		TransactionId xid;
 		TransactionId xmin;
@@ -2224,7 +2223,7 @@ GetSnapshotData(Snapshot snapshot)
 			TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
 			uint8		statusFlags;
 
-			Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff);
+			Assert(allProcs[arrayP->pgprocnos[pgxactoff]]->pgxactoff == pgxactoff);
 
 			/*
 			 * If the transaction has no XID assigned, we can skip it; it
@@ -2298,7 +2297,7 @@ GetSnapshotData(Snapshot snapshot)
 					if (nsubxids > 0)
 					{
 						int			pgprocno = pgprocnos[pgxactoff];
-						PGPROC	   *proc = &allProcs[pgprocno];
+						PGPROC	   *proc = allProcs[pgprocno];
 
 						pg_read_barrier();	/* pairs with GetNewTransactionId */
 
@@ -2499,7 +2498,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin,
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 		int			statusFlags = ProcGlobal->statusFlags[index];
 		TransactionId xid;
 
@@ -2725,7 +2724,7 @@ GetRunningTransactionData(void)
 		if (TransactionIdPrecedes(xid, oldestDatabaseRunningXid))
 		{
 			int			pgprocno = arrayP->pgprocnos[index];
-			PGPROC	   *proc = &allProcs[pgprocno];
+			PGPROC	   *proc = allProcs[pgprocno];
 
 			if (proc->databaseId == MyDatabaseId)
 				oldestDatabaseRunningXid = xid;
@@ -2756,7 +2755,7 @@ GetRunningTransactionData(void)
 		for (index = 0; index < arrayP->numProcs; index++)
 		{
 			int			pgprocno = arrayP->pgprocnos[index];
-			PGPROC	   *proc = &allProcs[pgprocno];
+			PGPROC	   *proc = allProcs[pgprocno];
 			int			nsubxids;
 
 			/*
@@ -2858,7 +2857,7 @@ GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs)
 	{
 		TransactionId xid;
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		/* Fetch xid just once - see GetNewTransactionId */
 		xid = UINT32_ACCESS_ONCE(other_xids[index]);
@@ -3020,7 +3019,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if ((proc->delayChkptFlags & type) != 0)
 		{
@@ -3061,7 +3060,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 		VirtualTransactionId vxid;
 
 		GET_VXID_FROM_PGPROC(vxid, *proc);
@@ -3189,7 +3188,7 @@ BackendPidGetProcWithLock(int pid)
 
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
-		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[index]];
+		PGPROC	   *proc = allProcs[arrayP->pgprocnos[index]];
 
 		if (proc->pid == pid)
 		{
@@ -3232,7 +3231,7 @@ BackendXidGetPid(TransactionId xid)
 		if (other_xids[index] == xid)
 		{
 			int			pgprocno = arrayP->pgprocnos[index];
-			PGPROC	   *proc = &allProcs[pgprocno];
+			PGPROC	   *proc = allProcs[pgprocno];
 
 			result = proc->pid;
 			break;
@@ -3301,7 +3300,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 		uint8		statusFlags = ProcGlobal->statusFlags[index];
 
 		if (proc == MyProc)
@@ -3403,7 +3402,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		/* Exclude prepared transactions */
 		if (proc->pid == 0)
@@ -3468,7 +3467,7 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 		VirtualTransactionId procvxid;
 
 		GET_VXID_FROM_PGPROC(procvxid, *proc);
@@ -3523,7 +3522,7 @@ MinimumActiveBackends(int min)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		/*
 		 * Since we're not holding a lock, need to be prepared to deal with
@@ -3569,7 +3568,7 @@ CountDBBackends(Oid databaseid)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if (proc->pid == 0)
 			continue;			/* do not count prepared xacts */
@@ -3598,7 +3597,7 @@ CountDBConnections(Oid databaseid)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if (proc->pid == 0)
 			continue;			/* do not count prepared xacts */
@@ -3629,7 +3628,7 @@ CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if (databaseid == InvalidOid || proc->databaseId == databaseid)
 		{
@@ -3670,7 +3669,7 @@ CountUserBackends(Oid roleid)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if (proc->pid == 0)
 			continue;			/* do not count prepared xacts */
@@ -3733,7 +3732,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
 		for (index = 0; index < arrayP->numProcs; index++)
 		{
 			int			pgprocno = arrayP->pgprocnos[index];
-			PGPROC	   *proc = &allProcs[pgprocno];
+			PGPROC	   *proc = allProcs[pgprocno];
 			uint8		statusFlags = ProcGlobal->statusFlags[index];
 
 			if (proc->databaseId != databaseId)
@@ -3799,7 +3798,7 @@ TerminateOtherDBBackends(Oid databaseId)
 	for (i = 0; i < procArray->numProcs; i++)
 	{
 		int			pgprocno = arrayP->pgprocnos[i];
-		PGPROC	   *proc = &allProcs[pgprocno];
+		PGPROC	   *proc = allProcs[pgprocno];
 
 		if (proc->databaseId != databaseId)
 			continue;
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 62f3471448e..c84a2a5f1bc 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -2844,7 +2844,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag
 	 */
 	for (i = 0; i < ProcGlobal->allProcCount; i++)
 	{
-		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+		PGPROC	   *proc = ProcGlobal->allProcs[i];
 		uint32		j;
 
 		LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
@@ -3103,7 +3103,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
 		 */
 		for (i = 0; i < ProcGlobal->allProcCount; i++)
 		{
-			PGPROC	   *proc = &ProcGlobal->allProcs[i];
+			PGPROC	   *proc = ProcGlobal->allProcs[i];
 			uint32		j;
 
 			/* A backend never blocks itself */
@@ -3790,7 +3790,7 @@ GetLockStatusData(void)
 	 */
 	for (i = 0; i < ProcGlobal->allProcCount; ++i)
 	{
-		PGPROC	   *proc = &ProcGlobal->allProcs[i];
+		PGPROC	   *proc = ProcGlobal->allProcs[i];
 
 		/* Skip backends with pid=0, as they don't hold fast-path locks */
 		if (proc->pid == 0)
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index e9ef0fbfe32..11259151a7d 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -29,21 +29,29 @@
  */
 #include "postgres.h"
 
+#include <sched.h>
 #include <signal.h>
 #include <unistd.h>
 #include <sys/time.h>
 
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_numa.h"
 #include "postmaster/autovacuum.h"
 #include "replication/slotsync.h"
 #include "replication/syncrep.h"
 #include "storage/condition_variable.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -90,6 +98,31 @@ static void AuxiliaryProcKill(int code, Datum arg);
 static void CheckDeadLock(void);
 
 
+/* number of NUMA nodes (as returned by numa_num_configured_nodes) */
+static int	numa_nodes = -1;	/* number of nodes when sizing */
+static Size numa_page_size = 0; /* page used to size partitions */
+static bool numa_can_partition = false; /* can map to NUMA nodes? */
+static int	numa_procs_per_node = -1;	/* pgprocs per node */
+
+static Size get_memory_page_size(void); /* XXX duplicate with buf_init.c */
+
+static void pgproc_partitions_prepare(void);
+static char *pgproc_partition_init(char *ptr, int num_procs,
+								   int allprocs_index, int node);
+static char *fastpath_partition_init(char *ptr, int num_procs,
+									 int allprocs_index, int node,
+									 Size fpLockBitsSize, Size fpRelIdSize);
+
+typedef struct PGProcPartition	/* registry entry for one PGPROC partition */
+{
+	int			num_procs;		/* number of PGPROC entries in partition */
+	int			numa_node;		/* NUMA node, or -1 if not tied to a node */
+	void	   *pgproc_ptr;		/* start of the partition's PGPROC array */
+	void	   *fastpath_ptr;	/* start of the partition's fast-path arrays */
+} PGProcPartition;
+
+static PGProcPartition *partitions = NULL;	/* registry, set in InitProcGlobal */
+
 /*
  * Report shared-memory space needed by PGPROC.
  */
@@ -100,11 +133,63 @@ PGProcShmemSize(void)
 	Size		TotalProcs =
 		add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts));
 
+	size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC *)));
 	size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC)));
 	size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids)));
 	size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates)));
 	size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags)));
 
+	/*
+	 * To support NUMA partitioning, the PGPROC array will be divided into
+	 * multiple chunks - one per NUMA node, and one extra for auxiliary/2PC
+	 * entries (which are not assigned to any NUMA node).
+	 *
+	 * We can't simply map pages of a single continuous array, because the
+	 * PGPROC entries are very small and too many of them would fit on a
+	 * single page (at least with huge pages). Far more than reasonable values
+	 * of max_connections. So instead we cut the array into separate pieces
+	 * for each node.
+	 *
+	 * Each piece may need up to one memory page of padding, to make it
+	 * aligned with memory page (for NUMA), so we just add a page - it's a bit
+	 * wasteful, but should not matter much - NUMA is meant for large boxes,
+	 * so a couple pages is negligible.
+	 *
+	 * We only do this with NUMA partitioning. With the GUC disabled, or when
+	 * we find we can't do that for some reason, we just allocate the PGPROC
+	 * array as a single chunk. This is determined by the earlier call to
+	 * pgproc_partitions_prepare().
+	 *
+	 * XXX It might be more painful with very large huge pages (e.g. 1GB).
+	 */
+
+	/*
+	 * If PGPROC partitioning is enabled, and we decided it's possible, we
+	 * need to add one memory page per NUMA node (and one for auxiliary/2PC
+	 * processes) to allow proper alignment.
+	 *
+	 * XXX This is a bit wasteful, because it might actually add pages even
+	 * when not strictly needed (if it's already aligned). And we always
+	 * assume we'll add a whole page, even if the alignment needs only less
+	 * memory.
+	 */
+	if (numa_procs_interleave && numa_can_partition)
+	{
+		Assert(numa_nodes > 0);
+		size = add_size(size, mul_size((numa_nodes + 1), numa_page_size));
+
+		/*
+		 * Also account for a small registry of partitions, a simple array of
+		 * partitions at the beginning.
+		 */
+		size = add_size(size, mul_size((numa_nodes + 1), sizeof(PGProcPartition)));
+	}
+	else
+	{
+		/* otherwise add only a tiny registry, with a single partition */
+		size = add_size(size, sizeof(PGProcPartition));
+	}
+
 	return size;
 }
 
@@ -129,6 +214,25 @@ FastPathLockShmemSize(void)
 
 	size = add_size(size, mul_size(TotalProcs, (fpLockBitsSize + fpRelIdSize)));
 
+	/*
+	 * When applying NUMA to the fast-path locks, we follow the same logic as
+	 * for PGPROC entries. See the comments in PGProcShmemSize().
+	 *
+	 * If PGPROC partitioning is enabled, and we decided it's possible, we
+	 * need to add one memory page per NUMA node (and one for auxiliary/2PC
+	 * processes) to allow proper alignment.
+	 *
+	 * XXX This is a bit wasteful, because it might actually add pages even
+	 * when not strictly needed (if it's already aligned). And we always
+	 * assume we'll add a whole page, even if the alignment needs only less
+	 * memory.
+	 */
+	if (numa_procs_interleave && numa_can_partition)
+	{
+		Assert(numa_nodes > 0);
+		size = add_size(size, mul_size((numa_nodes + 1), numa_page_size));
+	}
+
 	return size;
 }
 
@@ -140,6 +244,9 @@ ProcGlobalShmemSize(void)
 {
 	Size		size = 0;
 
+	/* calculate partition info for pgproc entries etc */
+	pgproc_partitions_prepare();
+
 	/* ProcGlobal */
 	size = add_size(size, sizeof(PROC_HDR));
 	size = add_size(size, sizeof(slock_t));
@@ -191,7 +298,7 @@ ProcGlobalSemas(void)
 void
 InitProcGlobal(void)
 {
-	PGPROC	   *procs;
+	PGPROC	  **procs;
 	int			i,
 				j;
 	bool		found;
@@ -205,6 +312,8 @@ InitProcGlobal(void)
 	Size		requestSize;
 	char	   *ptr;
 
+	Size		mem_page_size = get_memory_page_size();
+
 	/* Create the ProcGlobal shared structure */
 	ProcGlobal = (PROC_HDR *)
 		ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found);
@@ -241,19 +350,115 @@ InitProcGlobal(void)
 
 	MemSet(ptr, 0, requestSize);
 
-	procs = (PGPROC *) ptr;
-	ptr = (char *) ptr + TotalProcs * sizeof(PGPROC);
+	/* allprocs (array of pointers to PGPROC entries) */
+	procs = (PGPROC **) ptr;
+	ptr = (char *) ptr + TotalProcs * sizeof(PGPROC *);
 
 	ProcGlobal->allProcs = procs;
 	/* XXX allProcCount isn't really all of them; it excludes prepared xacts */
 	ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS;
 
+	/*
+	 * If NUMA partitioning is enabled, and we decided we actually can do the
+	 * partitioning, allocate the chunks.
+	 *
+	 * Otherwise we'll allocate a single array for everything. It's not quite
+	 * what we did without NUMA, because there's an extra level of
+	 * indirection, but it's the best we can do.
+	 */
+	if (numa_procs_interleave && numa_can_partition)
+	{
+		int			node_procs;
+		int			total_procs = 0;
+
+		Assert(numa_procs_per_node > 0);
+		Assert(numa_nodes > 0);
+
+		/*
+		 * The registry has one partition per NUMA node, plus one extra for
+		 * auxiliary procs / prepared xacts (matching PGProcShmemSize).
+		 */
+		partitions = (PGProcPartition *) ptr;
+		ptr += ((numa_nodes + 1) * sizeof(PGProcPartition));
+
+		/* build PGPROC entries for NUMA nodes */
+		for (i = 0; i < numa_nodes; i++)
+		{
+			/* the last NUMA node may get fewer PGPROC entries, but meh */
+			node_procs = Min(numa_procs_per_node, MaxBackends - total_procs);
+
+			/* make sure to align the PGPROC array to memory page */
+			ptr = (char *) TYPEALIGN(numa_page_size, ptr);
+
+			/* fill in the partition info */
+			partitions[i].num_procs = node_procs;
+			partitions[i].numa_node = i;
+			partitions[i].pgproc_ptr = ptr;
+
+			ptr = pgproc_partition_init(ptr, node_procs, total_procs, i);
+
+			total_procs += node_procs;
+
+			/* don't underflow/overflow the allocation */
+			Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize));
+		}
+
+		Assert(total_procs == MaxBackends);
+
+		/*
+		 * Also build PGPROC entries for auxiliary procs / prepared xacts (we
+		 * however don't assign those to any NUMA node).
+		 */
+		node_procs = (NUM_AUXILIARY_PROCS + max_prepared_xacts);
+
+		/* make sure to align the PGPROC array to memory page */
+		ptr = (char *) TYPEALIGN(numa_page_size, ptr);
+
+		/* fill in the partition info */
+		partitions[numa_nodes].num_procs = node_procs;
+		partitions[numa_nodes].numa_node = -1;
+		partitions[numa_nodes].pgproc_ptr = ptr;
+
+		ptr = pgproc_partition_init(ptr, node_procs, total_procs, -1);
+
+		total_procs += node_procs;
+
+		/* don't overflow the allocation */
+		Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize));
+
+		Assert(total_procs == TotalProcs);
+	}
+	else
+	{
+		/*
+		 * Now initialize the PGPROC partition registry with a single
+		 * partition for all the procs.
+		 */
+		partitions = (PGProcPartition *) ptr;
+		ptr += sizeof(PGProcPartition);
+
+		/* fill in the partition info (before ptr moves past the array) */
+		partitions[0].num_procs = TotalProcs;
+		partitions[0].numa_node = -1;
+		partitions[0].pgproc_ptr = ptr;
+
+		/* just treat everything as a single array, with no alignment */
+		ptr = pgproc_partition_init(ptr, TotalProcs, 0, -1);
+
+		/* don't overflow the allocation */
+		Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize));
+	}
+
 	/*
 	 * Allocate arrays mirroring PGPROC fields in a dense manner. See
 	 * PROC_HDR.
 	 *
 	 * XXX: It might make sense to increase padding for these arrays, given
 	 * how hotly they are accessed.
+	 *
+	 * XXX Would it make sense to NUMA-partition these chunks too, somehow?
+	 * But those arrays are tiny, fit into a single memory page, so would need
+	 * to be made more complex. Not sure.
 	 */
 	ProcGlobal->xids = (TransactionId *) ptr;
 	ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->xids));
@@ -286,24 +491,92 @@ InitProcGlobal(void)
 	/* For asserts checking we did not overflow. */
 	fpEndPtr = fpPtr + requestSize;
 
-	for (i = 0; i < TotalProcs; i++)
+	/*
+	 * Mimic the logic we used to partition PGPROC entries.
+	 */
+
+	/*
+	 * If NUMA partitioning is enabled, and we decided we actually can do the
+	 * partitioning, allocate the chunks.
+	 *
+	 * Otherwise we'll allocate a single array for everything. It's not quite
+	 * what we did without NUMA, because there's an extra level of
+	 * indirection, but it's the best we can do.
+	 */
+	if (numa_procs_interleave && numa_can_partition)
 	{
-		PGPROC	   *proc = &procs[i];
+		int			node_procs;
+		int			total_procs = 0;
+
+		Assert(numa_procs_per_node > 0);
+
+		/* build PGPROC entries for NUMA nodes */
+		for (i = 0; i < numa_nodes; i++)
+		{
+			/* the last NUMA node may get fewer PGPROC entries, but meh */
+			node_procs = Min(numa_procs_per_node, MaxBackends - total_procs);
+
+			/* make sure to align the PGPROC array to memory page */
+			fpPtr = (char *) TYPEALIGN(mem_page_size, fpPtr);
 
-		/* Common initialization for all PGPROCs, regardless of type. */
+			/* remember this pointer too */
+			partitions[i].fastpath_ptr = fpPtr;
+			Assert(node_procs == partitions[i].num_procs);
+
+			fpPtr = fastpath_partition_init(fpPtr, node_procs, total_procs, i,
+											fpLockBitsSize, fpRelIdSize);
+
+			total_procs += node_procs;
+
+			/* don't overflow the allocation */
+			Assert(fpPtr <= fpEndPtr);
+		}
+
+		Assert(total_procs == MaxBackends);
 
 		/*
-		 * Set the fast-path lock arrays, and move the pointer. We interleave
-		 * the two arrays, to (hopefully) get some locality for each backend.
+		 * Also build PGPROC entries for auxiliary procs / prepared xacts (we
+		 * however don't assign those to any NUMA node).
 		 */
-		proc->fpLockBits = (uint64 *) fpPtr;
-		fpPtr += fpLockBitsSize;
+		node_procs = (NUM_AUXILIARY_PROCS + max_prepared_xacts);
 
-		proc->fpRelId = (Oid *) fpPtr;
-		fpPtr += fpRelIdSize;
+		/* make sure to align the PGPROC array to memory page */
+		fpPtr = (char *) TYPEALIGN(numa_page_size, fpPtr);
 
+		/* remember this pointer too */
+		partitions[numa_nodes].fastpath_ptr = fpPtr;
+		Assert(node_procs == partitions[numa_nodes].num_procs);
+
+		fpPtr = fastpath_partition_init(fpPtr, node_procs, total_procs, -1,
+										fpLockBitsSize, fpRelIdSize);
+
+		total_procs += node_procs;
+
+		/* don't overflow the allocation */
 		Assert(fpPtr <= fpEndPtr);
 
+		Assert(total_procs = TotalProcs);
+	}
+	else
+	{
+		/* remember this pointer too */
+		partitions[0].fastpath_ptr = fpPtr;
+		Assert(TotalProcs == partitions[0].num_procs);
+
+		/* just treat everything as a single array, with no alignment */
+		fpPtr = fastpath_partition_init(fpPtr, TotalProcs, 0, -1,
+										fpLockBitsSize, fpRelIdSize);
+
+		/* don't overflow the allocation */
+		Assert(fpPtr <= fpEndPtr);
+	}
+
+	for (i = 0; i < TotalProcs; i++)
+	{
+		PGPROC	   *proc = procs[i];
+
+		Assert(proc->procnumber == i);
+
 		/*
 		 * Set up per-PGPROC semaphore, latch, and fpInfoLock.  Prepared xact
 		 * dummy PGPROCs don't need these though - they're never associated
@@ -366,15 +639,12 @@ InitProcGlobal(void)
 		pg_atomic_init_u64(&(proc->waitStart), 0);
 	}
 
-	/* Should have consumed exactly the expected amount of fast-path memory. */
-	Assert(fpPtr == fpEndPtr);
-
 	/*
 	 * Save pointers to the blocks of PGPROC structures reserved for auxiliary
 	 * processes and prepared transactions.
 	 */
-	AuxiliaryProcs = &procs[MaxBackends];
-	PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS];
+	AuxiliaryProcs = procs[MaxBackends];
+	PreparedXactProcs = procs[MaxBackends + NUM_AUXILIARY_PROCS];
 
 	/* Create ProcStructLock spinlock, too */
 	ProcStructLock = (slock_t *) ShmemInitStruct("ProcStructLock spinlock",
@@ -435,7 +705,45 @@ InitProcess(void)
 
 	if (!dlist_is_empty(procgloballist))
 	{
-		MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist));
+		/*
+		 * With numa interleaving of PGPROC, try to get a PROC entry from the
+		 * right NUMA node (when the process starts).
+		 *
+		 * XXX The process may move to a different NUMA node later, but
+		 * there's not much we can do about that.
+		 */
+		if (numa_procs_interleave)
+		{
+			dlist_mutable_iter iter;
+			unsigned	cpu;
+			unsigned	node;
+			int			rc;
+
+			rc = getcpu(&cpu, &node);
+			if (rc != 0)
+				elog(ERROR, "getcpu failed: %m");
+
+			MyProc = NULL;
+
+			dlist_foreach_modify(iter, procgloballist)
+			{
+				PGPROC	   *proc;
+
+				proc = dlist_container(PGPROC, links, iter.cur);
+
+				if (proc->numa_node == node)
+				{
+					MyProc = proc;
+					dlist_delete(iter.cur);
+					break;
+				}
+			}
+		}
+
+		/* didn't find PGPROC from the correct NUMA node, pick any free one */
+		if (MyProc == NULL)
+			MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist));
+
 		SpinLockRelease(ProcStructLock);
 	}
 	else
@@ -1988,7 +2296,7 @@ ProcSendSignal(ProcNumber procNumber)
 	if (procNumber < 0 || procNumber >= ProcGlobal->allProcCount)
 		elog(ERROR, "procNumber out of range");
 
-	SetLatch(&ProcGlobal->allProcs[procNumber].procLatch);
+	SetLatch(&ProcGlobal->allProcs[procNumber]->procLatch);
 }
 
 /*
@@ -2063,3 +2371,222 @@ BecomeLockGroupMember(PGPROC *leader, int pid)
 
 	return ok;
 }
+
+/* Page size used for alignment; same logic as the copy in buf_init.c. */
+static Size
+get_memory_page_size(void)
+{
+	Size		regular_size;
+	Size		huge_size = 0;
+
+#ifdef WIN32
+	SYSTEM_INFO sysinfo;
+
+	GetSystemInfo(&sysinfo);
+	regular_size = sysinfo.dwPageSize;
+#else
+	regular_size = sysconf(_SC_PAGESIZE);
+#endif
+
+	/*
+	 * XXX The result may depend on when this gets called. Before mmap() we
+	 * can't know whether huge pages will actually be used, so we assume they
+	 * will be (unless disabled by HUGE_PAGES_OFF). If we then don't get huge
+	 * pages, the alignment is larger than needed, and some of the memory
+	 * gets wasted.
+	 */
+
+	/* huge_size stays 0 with HUGE_PAGES_OFF, otherwise look it up */
+	if (huge_pages_status != HUGE_PAGES_OFF)
+		GetHugePageSize(&huge_size, NULL);
+
+	/* whichever of the two page sizes is larger */
+	return Max(regular_size, huge_size);
+}
+
+/*
+ * pgproc_partitions_prepare
+ *		Calculate parameters for partitioning of PGPROC entries.
+ *
+ * Determines the number of NUMA nodes, the memory page size used to align
+ * the partitions, and how many "backend" PGPROC entries to place on each
+ * node. The values are calculated only once, later calls do nothing.
+ *
+ * The PGPROC arrays get built as one "chunk" per NUMA node, plus one extra
+ * chunk for auxiliary processes and 2PC transactions (not associated with
+ * any particular node).
+ *
+ * MaxBackends may not be exactly divisible by the number of nodes, so the
+ * per-node count is rounded up. The last node may get somewhat fewer PGPROC
+ * entries, but the imbalance ought to be pretty small (if MaxBackends >>
+ * numa_nodes).
+ *
+ * XXX A fairer distribution is possible, but not worth it for now.
+ */
+static void
+pgproc_partitions_prepare(void)
+{
+	int			nodes;
+
+	/* already calculated by an earlier call, nothing to do */
+	if (numa_nodes != -1)
+		return;
+
+	/* XXX only gives us the number, the nodes may not be 0, 1, 2, ... */
+	nodes = numa_num_configured_nodes();
+
+	/* XXX can this happen? */
+	numa_nodes = (nodes < 1) ? 1 : nodes;
+
+	/*
+	 * XXX A bit weird. Do we need to worry about postmaster? Could this even
+	 * run outside postmaster? I don't think so.
+	 *
+	 * XXX We may get a different value than when sizing the memory, because
+	 * at that point we assumed huge pages. Shouldn't cause crashes, but some
+	 * of the memory may stay unused (alignment we don't actually need).
+	 */
+	numa_page_size = IsUnderPostmaster ?
+		pg_get_shmem_pagesize() : get_memory_page_size();
+
+	/* number of backends per node, rounded up */
+	numa_procs_per_node = (MaxBackends + numa_nodes - 1) / numa_nodes;
+
+	elog(LOG, "NUMA: pgproc backends %d num_nodes %d per_node %d",
+		 MaxBackends, numa_nodes, numa_procs_per_node);
+
+	Assert(numa_nodes * numa_procs_per_node >= MaxBackends);
+
+	/* success */
+	numa_can_partition = true;
+}
+
+/*
+ * pg_numa_move_to_node
+ *		Ask libnuma to place the memory range [startptr, endptr) on a node.
+ *
+ * startptr has to be aligned to the "actual" memory page size, not the one
+ * used for sizing (which may have assumed huge pages we then did not get).
+ * That is cross-checked only in assert-enabled builds.
+ */
+static void
+pg_numa_move_to_node(char *startptr, char *endptr, int node)
+{
+	Size		mem_page_size PG_USED_FOR_ASSERTS_ONLY;
+
+	/* XXX Could this even run outside postmaster? I don't think so. */
+	if (IsUnderPostmaster)
+		mem_page_size = pg_get_shmem_pagesize();
+	else
+		mem_page_size = get_memory_page_size();
+
+	/* uintptr_t rather than int64, to work for any pointer width */
+	Assert(((uintptr_t) startptr % mem_page_size) == 0);
+
+	numa_tonode_memory(startptr, endptr - startptr, node);
+}
+
+/*
+ * pgproc_partition_init
+ *		Initialize one partition of PGPROC entries, starting at ptr.
+ *
+ * Registers the entries in ProcGlobal->allProcs, starting at allprocs_index,
+ * and returns the pointer right after the partition. With node != -1 the
+ * memory gets moved to that NUMA node. The caller has to align ptr, this
+ * function doesn't do any alignment.
+ */
+static char *
+pgproc_partition_init(char *ptr, int num_procs, int allprocs_index, int node)
+{
+	PGPROC	   *procs_node = (PGPROC *) ptr;
+
+	/* pointer right after this array */
+	ptr += num_procs * sizeof(PGPROC);
+
+	elog(LOG, "NUMA: pgproc_partition_init procs %p endptr %p num_procs %d node %d",
+		 procs_node, ptr, num_procs, node);
+
+	/*
+	 * if node specified, move to node - do this before we start touching the
+	 * memory, to make sure it's not mapped to any node yet
+	 */
+	if (node != -1)
+		pg_numa_move_to_node((char *) procs_node, ptr, node);
+
+	/* add pointers to the PGPROC entries to allProcs */
+	for (int i = 0; i < num_procs; i++)
+	{
+		procs_node[i].numa_node = node;
+		procs_node[i].procnumber = allprocs_index;
+		ProcGlobal->allProcs[allprocs_index++] = &procs_node[i];
+	}
+
+	return ptr;
+}
+
+/*
+ * fastpath_partition_init
+ *		Set up fast-path lock arrays for num_procs PGPROC entries.
+ *
+ * The entries are looked up in ProcGlobal->allProcs, from allprocs_index.
+ * Returns the pointer right after the arrays. Doesn't do any alignment.
+ */
+static char *
+fastpath_partition_init(char *ptr, int num_procs, int allprocs_index, int node,
+						Size fpLockBitsSize, Size fpRelIdSize)
+{
+	Size		entry_size = fpLockBitsSize + fpRelIdSize;
+	char	   *endptr = ptr + num_procs * entry_size;
+	int			last_index = allprocs_index + num_procs;
+
+	/* move the memory to the node (if any) before we start touching it */
+	if (node != -1)
+		pg_numa_move_to_node(ptr, endptr, node);
+
+	/* point the PGPROC entries to their fast-path arrays */
+	for (int procno = allprocs_index; procno < last_index; procno++)
+	{
+		PGPROC	   *proc = ProcGlobal->allProcs[procno];
+
+		/* cross-check we got the expected NUMA node */
+		Assert(proc->numa_node == node);
+		Assert(proc->procnumber == procno);
+
+		/*
+		 * The two arrays of each entry are placed next to each other (lock
+		 * bits first), to (hopefully) get some locality for each backend.
+		 */
+		proc->fpLockBits = (uint64 *) ptr;
+		proc->fpRelId = (Oid *) (ptr + fpLockBitsSize);
+
+		ptr += entry_size;
+
+		Assert(ptr <= endptr);
+	}
+
+	Assert(ptr == endptr);
+
+	return endptr;
+}
+
+/* Number of PGPROC partitions (NUMA nodes + 1 when partitioned, else 1). */
+int
+ProcPartitionCount(void)
+{
+	bool		partitioned = (numa_procs_interleave && numa_can_partition);
+
+	return partitioned ? (numa_nodes + 1) : 1;
+}
+
+/* Look up PGPROC partition idx (0 ... ProcPartitionCount() - 1). */
+void
+ProcPartitionGet(int idx, int *node, int *nprocs, void **procsptr, void **fpptr)
+{
+	Assert((idx >= 0) && (idx < ProcPartitionCount()));
+
+	/* hand the partition info over through the output parameters */
+	*nprocs = partitions[idx].num_procs;
+	*procsptr = partitions[idx].pgproc_ptr;
+	*fpptr = partitions[idx].fastpath_ptr;
+	*node = partitions[idx].numa_node;
+}
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index a11bc71a386..6ee4684d1b8 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -149,6 +149,7 @@ int			MaxBackends = 0;
 bool		numa_buffers_interleave = false;
 bool		numa_localalloc = false;
 bool		numa_partition_freelist = false;
+bool		numa_procs_interleave = false;
 
 /* GUC parameters for vacuum */
 int			VacuumBufferUsageLimit = 2048;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 0552ed62cc7..7b718760248 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2146,6 +2146,16 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"numa_procs_interleave", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+			gettext_noop("Enables NUMA interleaving of PGPROC entries."),
+			gettext_noop("When enabled, the PGPROC entries are interleaved to all NUMA nodes."),
+		},
+		&numa_procs_interleave,
+		false,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY,
 			gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."),
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 66baf2bf33e..cdeee8dccba 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -181,6 +181,7 @@ extern PGDLLIMPORT int max_parallel_workers;
 extern PGDLLIMPORT bool numa_buffers_interleave;
 extern PGDLLIMPORT bool numa_localalloc;
 extern PGDLLIMPORT bool numa_partition_freelist;
+extern PGDLLIMPORT bool numa_procs_interleave;
 
 extern PGDLLIMPORT int commit_timestamp_buffers;
 extern PGDLLIMPORT int multixact_member_buffers;
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c6f5ebceefd..d2d269941fc 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -202,6 +202,8 @@ struct PGPROC
 								 * vacuum must not remove tuples deleted by
 								 * xid >= xmin ! */
 
+	int			procnumber;		/* index in ProcGlobal->allProcs */
+
 	int			pid;			/* Backend's process ID; 0 if prepared xact */
 
 	int			pgxactoff;		/* offset into various ProcGlobal->arrays with
@@ -327,6 +329,9 @@ struct PGPROC
 	PGPROC	   *lockGroupLeader;	/* lock group leader, if I'm a member */
 	dlist_head	lockGroupMembers;	/* list of members, if I'm a leader */
 	dlist_node	lockGroupLink;	/* my member link, if I'm a member */
+
+	/* NUMA node */
+	int			numa_node;
 };
 
 /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */
@@ -391,7 +396,7 @@ extern PGDLLIMPORT PGPROC *MyProc;
 typedef struct PROC_HDR
 {
 	/* Array of PGPROC structures (not including dummies for prepared txns) */
-	PGPROC	   *allProcs;
+	PGPROC	  **allProcs;
 
 	/* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */
 	TransactionId *xids;
@@ -443,8 +448,8 @@ extern PGDLLIMPORT PGPROC *PreparedXactProcs;
 /*
  * Accessors for getting PGPROC given a ProcNumber and vice versa.
  */
-#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)])
-#define GetNumberFromPGProc(proc) ((proc) - &ProcGlobal->allProcs[0])
+#define GetPGProcByNumber(n) (ProcGlobal->allProcs[(n)])
+#define GetNumberFromPGProc(proc) ((proc)->procnumber)
 
 /*
  * We set aside some extra PGPROC structures for "special worker" processes,
@@ -520,4 +525,7 @@ extern PGPROC *AuxiliaryPidGetProc(int pid);
 extern void BecomeLockGroupLeader(void);
 extern bool BecomeLockGroupMember(PGPROC *leader, int pid);
 
+extern int	ProcPartitionCount(void);
+extern void ProcPartitionGet(int idx, int *node, int *nprocs, void **procsptr, void **fpptr);
+
 #endif							/* _PROC_H_ */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8540d537a3e..ded2db30422 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1876,6 +1876,7 @@ PGP_MPI
 PGP_PubKey
 PGP_S2K
 PGPing
+PGProcPartition
 PGQueryClass
 PGRUsage
 PGSemaphore
-- 
2.50.1