From b6e4879a397210aff0f0f720708471568fe99ea4 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 11 Nov 2025 12:10:03 +0100 Subject: [PATCH v20251126 8/9] NUMA: partition PGPROC The goal is to distribute ProcArray (or rather PGPROC entries and associated fast-path arrays) to NUMA nodes. We can't do this by simply interleaving pages, because that wouldn't work for both parts at the same time. We want to place the PGPROC and its fast-path locking structs on the same node, but the structs are of different sizes, etc. Another problem is that PGPROC entries are fairly small, so with huge pages and reasonable values of max_connections everything fits onto a single page. We don't want to make this incompatible with huge pages. Note: If we eventually switch to allocating separate shared segments for different parts (to allow on-line resizing), we could keep using regular pages for procarray, and this would not be such an issue. To make this work, we split the PGPROC array into per-node segments, each with about (MaxBackends / numa_nodes) entries, and one segment for auxiliary processes and prepared transactions. And we do the same thing for fast-path arrays. The PGPROC segments are laid out like this (e.g. for 2 NUMA nodes): - PGPROC array / node #0 - PGPROC array / node #1 - PGPROC array / aux processes + 2PC transactions - fast-path arrays / node #0 - fast-path arrays / node #1 - fast-path arrays / aux processes + 2PC transactions Each segment is aligned to (starts at) a memory page, and is effectively a multiple of the memory page size. Having a single PGPROC array made certain operations easier - e.g. it was possible to iterate the array, and GetNumberFromPGProc() could calculate offset by simply subtracting PGPROC pointers. With multiple segments that's not possible, but the fallout is minimal. Most places accessed PGPROC through PROC_HDR->allProcs, and can continue to do so, except that now they get a pointer to the PGPROC (which most places wanted anyway). 
With the feature disabled, there's only a single "partition" for all PGPROC entries. Similarly to the buffer partitioning, this introduces a small "registry" of partitions, as a source of truth. And then also a new "system" view "pg_buffercache_pgproc" showing basic information about the partitions. Note: There's an indirection, though. But the pointer does not change, so hopefully that's not an issue. And each PGPROC entry gets an explicit procnumber field, which is the index in allProcs, so GetNumberFromPGProc can simply return that. Each PGPROC also gets numa_node, tracking the NUMA node, so that we don't have to recalculate that. This is used by InitProcess() to pick a PGPROC entry from the local NUMA node. Note: The scheduler may migrate the process to a different CPU/node later. Maybe we should consider pinning the process to the node? Note: There are some challenges in making this work on EXEC_BACKEND, even if we don't support NUMA on platforms that require this. --- .../pg_buffercache--1.7--1.8.sql | 19 + contrib/pg_buffercache/pg_buffercache_pages.c | 94 +++ src/backend/access/transam/clog.c | 4 +- src/backend/access/transam/twophase.c | 3 +- src/backend/postmaster/launch_backend.c | 4 +- src/backend/postmaster/pgarch.c | 2 +- src/backend/postmaster/walsummarizer.c | 2 +- src/backend/storage/buffer/buf_init.c | 2 + src/backend/storage/buffer/freelist.c | 2 +- src/backend/storage/ipc/procarray.c | 85 ++- src/backend/storage/lmgr/lock.c | 6 +- src/backend/storage/lmgr/proc.c | 551 +++++++++++++++++- src/include/port/pg_numa.h | 1 + src/include/storage/proc.h | 18 +- src/tools/pgindent/typedefs.list | 1 + 15 files changed, 722 insertions(+), 72 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql index 43d2e84f9d2..265c35c8252 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql @@ -31,3 +31,22 @@ REVOKE ALL ON 
pg_buffercache_partitions FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor; GRANT SELECT ON pg_buffercache_partitions TO pg_monitor; + +-- Register the new functions. +CREATE OR REPLACE FUNCTION pg_buffercache_pgproc() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pgproc' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_pgproc AS + SELECT P.* FROM pg_buffercache_pgproc() AS P + (partition integer, + numa_node integer, num_procs integer, pgproc_ptr bigint, fastpath_ptr bigint); + +-- Don't want these to be available to public. +REVOKE ALL ON FUNCTION pg_buffercache_pgproc() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_pgproc FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_pgproc() TO pg_monitor; +GRANT SELECT ON pg_buffercache_pgproc TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index eae75375152..c6629f767d1 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -15,6 +15,7 @@ #include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/proc.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -30,6 +31,7 @@ #define NUM_BUFFERCACHE_OS_PAGES_ELEM 3 #define NUM_BUFFERCACHE_PARTITIONS_ELEM 12 +#define NUM_BUFFERCACHE_PGPROC_ELEM 5 PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", @@ -105,6 +107,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); PG_FUNCTION_INFO_V1(pg_buffercache_partitions); +PG_FUNCTION_INFO_V1(pg_buffercache_pgproc); /* Only need to touch memory once per backend process lifetime */ @@ -981,3 +984,94 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +/* + * Inquire about partitioning of PGPROC array. 
+ */ +Datum +pg_buffercache_pgproc(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + TupleDesc tupledesc; + TupleDesc expected_tupledesc; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts != NUM_BUFFERCACHE_PGPROC_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. */ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "numa_node", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "num_procs", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "pgproc_ptr", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "fastpath_ptr", + INT8OID, -1, 0); + + funcctx->user_fctx = BlessTupleDesc(tupledesc); + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + /* Set max calls and remember the user function context. 
*/ + funcctx->max_calls = ProcPartitionCount(); + } + + funcctx = SRF_PERCALL_SETUP(); + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + + int numa_node, + num_procs; + + void *pgproc_ptr, + *fastpath_ptr; + + Datum values[NUM_BUFFERCACHE_PGPROC_ELEM]; + bool nulls[NUM_BUFFERCACHE_PGPROC_ELEM]; + + ProcPartitionGet(i, &numa_node, &num_procs, + &pgproc_ptr, &fastpath_ptr); + + values[0] = Int32GetDatum(i); + nulls[0] = false; + + values[1] = Int32GetDatum(numa_node); + nulls[1] = false; + + values[2] = Int32GetDatum(num_procs); + nulls[2] = false; + + values[3] = PointerGetDatum(pgproc_ptr); + nulls[3] = false; + + values[4] = PointerGetDatum(fastpath_ptr); + nulls[4] = false; + + /* Build and return the tuple. */ + tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls); + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index ea43b432daf..7d589bac115 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -575,7 +575,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, /* Walk the list and update the status of all XIDs. 
*/ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &ProcGlobal->allProcs[nextidx]; + PGPROC *nextproc = ProcGlobal->allProcs[nextidx]; int64 thispageno = nextproc->clogGroupMemberPage; /* @@ -634,7 +634,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *wakeproc = &ProcGlobal->allProcs[wakeidx]; + PGPROC *wakeproc = ProcGlobal->allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext); pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 89d0bfa7760..e0e17293536 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -282,7 +282,7 @@ TwoPhaseShmemInit(void) TwoPhaseState->freeGXacts = &gxacts[i]; /* associate it with a PGPROC assigned by InitProcGlobal */ - gxacts[i].pgprocno = GetNumberFromPGProc(&PreparedXactProcs[i]); + gxacts[i].pgprocno = GetNumberFromPGProc(PreparedXactProcs[i]); } } else @@ -447,6 +447,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, FullTransactionId fxid, /* Initialize the PGPROC entry */ MemSet(proc, 0, sizeof(PGPROC)); + proc->procnumber = gxact->pgprocno; dlist_node_init(&proc->links); proc->waitStatus = PROC_WAIT_STATUS_OK; if (LocalTransactionIdIsValid(MyProc->vxid.lxid)) diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 976638a58ac..5e7b0ac8850 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -107,8 +107,8 @@ typedef struct LWLockPadded *MainLWLockArray; slock_t *ProcStructLock; PROC_HDR *ProcGlobal; - PGPROC *AuxiliaryProcs; - PGPROC *PreparedXactProcs; + PGPROC **AuxiliaryProcs; + PGPROC **PreparedXactProcs; volatile PMSignalData *PMSignalState; ProcSignalHeader *ProcSignal; pid_t PostmasterPid; diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c 
index ce6b5299324..3288900bb6f 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -292,7 +292,7 @@ PgArchWakeup(void) * be relaunched shortly and will start archiving. */ if (arch_pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[arch_pgprocno]->procLatch); } diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index c4a888a081c..f5844aa5b6a 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -649,7 +649,7 @@ WakeupWalSummarizer(void) LWLockRelease(WALSummarizerLock); if (pgprocno != INVALID_PROC_NUMBER) - SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[pgprocno]->procLatch); } /* diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 9ba455488a0..55016cce93f 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -824,6 +824,8 @@ check_debug_numa(char **newval, void **extra, GucSource source) if (pg_strcasecmp(item, "buffers") == 0) flags |= NUMA_BUFFERS; + else if (pg_strcasecmp(item, "procs") == 0) + flags |= NUMA_PROCS; else { GUC_check_errdetail("Invalid option \"%s\".", item); diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 810a549efce..0937292643f 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -472,7 +472,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r * actually fine because procLatch isn't ever freed, so we just can * potentially set the wrong process' (or no process') latch. 
*/ - SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch); + SetLatch(&ProcGlobal->allProcs[bgwprocno]->procLatch); } /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 200f72c6e25..7e28fbdfea3 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -268,7 +268,7 @@ typedef enum KAXCompressReason static ProcArrayStruct *procArray; -static PGPROC *allProcs; +static PGPROC **allProcs; /* * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() @@ -369,6 +369,8 @@ static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, TransactionId xid); static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); +static void AssertCheckAllProcs(void); + /* * Report shared-memory space needed by ProcArrayShmemInit */ @@ -476,6 +478,8 @@ ProcArrayAdd(PGPROC *proc) LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + AssertCheckAllProcs(); + if (arrayP->numProcs >= arrayP->maxProcs) { /* @@ -502,7 +506,7 @@ ProcArrayAdd(PGPROC *proc) int this_procno = arrayP->pgprocnos[index]; Assert(this_procno >= 0 && this_procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[this_procno].pgxactoff == index); + Assert(allProcs[this_procno]->pgxactoff == index); /* If we have found our right position in the array, break */ if (this_procno > pgprocno) @@ -538,11 +542,13 @@ ProcArrayAdd(PGPROC *proc) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff == index - 1); + Assert(allProcs[procno]->pgxactoff == index - 1); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } + AssertCheckAllProcs(); + /* * Release in reversed acquisition order, to reduce frequency of having to * wait for XidGenLock while holding ProcArrayLock. 
@@ -578,10 +584,12 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + AssertCheckAllProcs(); + myoff = proc->pgxactoff; Assert(myoff >= 0 && myoff < arrayP->numProcs); - Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]]->pgxactoff == myoff); if (TransactionIdIsValid(latestXid)) { @@ -636,11 +644,13 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) int procno = arrayP->pgprocnos[index]; Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); - Assert(allProcs[procno].pgxactoff - 1 == index); + Assert(allProcs[procno]->pgxactoff - 1 == index); - allProcs[procno].pgxactoff = index; + allProcs[procno]->pgxactoff = index; } + AssertCheckAllProcs(); + /* * Release in reversed acquisition order, to reduce frequency of having to * wait for XidGenLock while holding ProcArrayLock. @@ -860,7 +870,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) /* Walk the list and clear all XIDs. 
*/ while (nextidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[nextidx]; + PGPROC *nextproc = allProcs[nextidx]; ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); @@ -880,7 +890,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) */ while (wakeidx != INVALID_PROC_NUMBER) { - PGPROC *nextproc = &allProcs[wakeidx]; + PGPROC *nextproc = allProcs[wakeidx]; wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PROC_NUMBER); @@ -1526,7 +1536,7 @@ TransactionIdIsInProgress(TransactionId xid) pxids = other_subxidstates[pgxactoff].count; pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ pgprocno = arrayP->pgprocnos[pgxactoff]; - proc = &allProcs[pgprocno]; + proc = allProcs[pgprocno]; for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ @@ -1622,7 +1632,6 @@ TransactionIdIsInProgress(TransactionId xid) return false; } - /* * Determine XID horizons. 
* @@ -1740,7 +1749,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) for (int index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int8 statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; TransactionId xmin; @@ -2224,7 +2233,7 @@ GetSnapshotData(Snapshot snapshot) TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); uint8 statusFlags; - Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + Assert(allProcs[arrayP->pgprocnos[pgxactoff]]->pgxactoff == pgxactoff); /* * If the transaction has no XID assigned, we can skip it; it @@ -2298,7 +2307,7 @@ GetSnapshotData(Snapshot snapshot) if (nsubxids > 0) { int pgprocno = pgprocnos[pgxactoff]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; pg_read_barrier(); /* pairs with GetNewTransactionId */ @@ -2499,7 +2508,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int statusFlags = ProcGlobal->statusFlags[index]; TransactionId xid; @@ -2725,7 +2734,7 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestDatabaseRunningXid)) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId == MyDatabaseId) oldestDatabaseRunningXid = xid; @@ -2756,7 +2765,7 @@ GetRunningTransactionData(void) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; int nsubxids; /* @@ -2858,7 +2867,7 @@ GetOldestActiveTransactionId(bool inCommitOnly, bool allDbs) { TransactionId xid; int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; 
/* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(other_xids[index]); @@ -3020,7 +3029,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if ((proc->delayChkptFlags & type) != 0) { @@ -3061,7 +3070,7 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId vxid; GET_VXID_FROM_PGPROC(vxid, *proc); @@ -3189,7 +3198,7 @@ BackendPidGetProcWithLock(int pid) for (index = 0; index < arrayP->numProcs; index++) { - PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + PGPROC *proc = allProcs[arrayP->pgprocnos[index]]; if (proc->pid == pid) { @@ -3232,7 +3241,7 @@ BackendXidGetPid(TransactionId xid) if (other_xids[index] == xid) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; result = proc->pid; break; @@ -3301,7 +3310,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc == MyProc) @@ -3403,7 +3412,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* Exclude prepared transactions */ if (proc->pid == 0) @@ -3468,7 +3477,7 @@ SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = 
arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; VirtualTransactionId procvxid; GET_VXID_FROM_PGPROC(procvxid, *proc); @@ -3523,7 +3532,7 @@ MinimumActiveBackends(int min) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; /* * Since we're not holding a lock, need to be prepared to deal with @@ -3569,7 +3578,7 @@ CountDBBackends(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3598,7 +3607,7 @@ CountDBConnections(Oid databaseid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3629,7 +3638,7 @@ CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (databaseid == InvalidOid || proc->databaseId == databaseid) { @@ -3670,7 +3679,7 @@ CountUserBackends(Oid roleid) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3733,7 +3742,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; uint8 statusFlags = ProcGlobal->statusFlags[index]; if (proc->databaseId != 
databaseId) @@ -3799,7 +3808,7 @@ TerminateOtherDBBackends(Oid databaseId) for (i = 0; i < procArray->numProcs; i++) { int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; + PGPROC *proc = allProcs[pgprocno]; if (proc->databaseId != databaseId) continue; @@ -5227,3 +5236,15 @@ KnownAssignedXidsReset(void) LWLockRelease(ProcArrayLock); } + +static void +AssertCheckAllProcs(void) +{ + ProcArrayStruct *arrayP = procArray; + int numProcs = arrayP->numProcs; + + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + Assert(allProcs[arrayP->pgprocnos[pgxactoff]]->pgxactoff == pgxactoff); + } +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 9cb78ead105..f82e664ad3f 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -2876,7 +2876,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); @@ -3135,7 +3135,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; uint32 j; /* A backend never blocks itself */ @@ -3822,7 +3822,7 @@ GetLockStatusData(void) */ for (i = 0; i < ProcGlobal->allProcCount; ++i) { - PGPROC *proc = &ProcGlobal->allProcs[i]; + PGPROC *proc = ProcGlobal->allProcs[i]; /* Skip backends with pid=0, as they don't hold fast-path locks */ if (proc->pid == 0) diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 1504fafe6d8..0a0ce98b725 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -29,22 +29,33 @@ */ #include "postgres.h" +#ifdef USE_LIBNUMA +#include +#endif + #include #include #include +#ifdef USE_LIBNUMA 
+#include +#include +#endif + #include "access/transam.h" #include "access/twophase.h" #include "access/xlogutils.h" #include "access/xlogwait.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_numa.h" #include "postmaster/autovacuum.h" #include "replication/slotsync.h" #include "replication/syncrep.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -77,8 +88,8 @@ NON_EXEC_STATIC slock_t *ProcStructLock = NULL; /* Pointers to shared-memory structures */ PROC_HDR *ProcGlobal = NULL; -NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; -PGPROC *PreparedXactProcs = NULL; +NON_EXEC_STATIC PGPROC **AuxiliaryProcs = NULL; +PGPROC **PreparedXactProcs = NULL; static DeadLockState deadlock_state = DS_NOT_YET_CHECKED; @@ -91,6 +102,29 @@ static void AuxiliaryProcKill(int code, Datum arg); static void CheckDeadLock(void); +/* number of NUMA nodes (as returned by numa_num_configured_nodes) */ +static int numa_nodes = -1; /* number of nodes when sizing */ +static Size numa_page_size = 0; /* page used to size partitions */ +static bool numa_can_partition = false; /* can map to NUMA nodes? */ +static int numa_procs_per_node = -1; /* pgprocs per node */ + +static void pgproc_partitions_prepare(void); +static char *pgproc_partition_init(char *ptr, int num_procs, + int allprocs_index, int node); +static char *fastpath_partition_init(char *ptr, int num_procs, + int allprocs_index, int node, + Size fpLockBitsSize, Size fpRelIdSize); + +typedef struct PGProcPartition +{ + int num_procs; + int numa_node; + void *pgproc_ptr; + void *fastpath_ptr; +} PGProcPartition; + +static PGProcPartition *partitions = NULL; + /* * Report shared-memory space needed by PGPROC. 
*/ @@ -101,11 +135,41 @@ PGProcShmemSize(void) Size TotalProcs = add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); + size = add_size(size, CACHELINEALIGN(mul_size(TotalProcs, sizeof(PGPROC *)))); size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags))); + /* + * To support NUMA partitioning, the PGPROC array will be divided into + * multiple chunks - one per NUMA node, and one extra for auxiliary/2PC + * entries (which are not assigned to any NUMA node). + * + * We can't simply map pages of a single continuous array, because the + * PGPROC entries are very small and too many of them would fit on a + * single page (at least with huge pages). Far more than reasonable values + * of max_connections. So instead we cut the array into separate pieces + * for each node. + * + * Each piece may need up to one memory page of padding, to make it + * aligned with memory page (for NUMA), So we just add a page - it's a bit + * wasteful, but should not matter much - NUMA is meant for large boxes, + * so a couple pages is negligible. + * + * We only do this with NUMA partitioning. With the GUC disabled, or when + * we find we can't do that for some reason, we just allocate the PGPROC + * array as a single chunk. This is determined by the earlier call to + * pgproc_partitions_prepare(). + * + * XXX It might be more painful with very large huge pages (e.g. 1GB). 
+ */ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) + { + Assert(numa_nodes > 0); + size = add_size(size, mul_size((numa_nodes + 1), numa_page_size)); + } + return size; } @@ -130,6 +194,60 @@ FastPathLockShmemSize(void) size = add_size(size, mul_size(TotalProcs, (fpLockBitsSize + fpRelIdSize))); + /* + * When applying NUMA to the fast-path locks, we follow the same logic as + * for PGPROC entries. See the comments in PGProcShmemSize(). + * + * If PGPROC partitioning is enabled, and we decided it's possible, we + * need to add one memory page per NUMA node (and one for auxiliary/2PC + * processes) to allow proper alignment. + * + * XXX This is a bit wasteful, because it might actually add pages even + * when not strictly needed (if it's already aligned). And we always + * assume we'll add a whole page, even if the alignment needs only less + * memory. + */ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) + { + Assert(numa_nodes > 0); + size = add_size(size, mul_size((numa_nodes + 1), numa_page_size)); + } + + return size; +} + +static Size +PGProcPartitionsShmemSize(void) +{ + Size size = 0; + + /* + * If PGPROC partitioning is enabled, and we decided it's possible, we + * need to add one memory page per NUMA node (and one for auxiliary/2PC + * processes) to allow proper alignment. + * + * XXX This is a bit wasteful, because it might actually add pages even + * when not strictly needed (if it's already aligned). And we always + * assume we'll add a whole page, even if the alignment needs only less + * memory. + */ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) + { + Assert(numa_nodes > 0); + size = add_size(size, mul_size((numa_nodes + 1), numa_page_size)); + + /* + * Also account for a small registry of partitions, a simple array of + * partitions at the beginning. 
+ */ + size = add_size(size, mul_size((numa_nodes + 1), sizeof(PGProcPartition))); + } + else + { + /* otherwise add only a tiny registry, with a single partition */ + size = add_size(size, sizeof(PGProcPartition)); + } + return size; } @@ -141,6 +259,9 @@ ProcGlobalShmemSize(void) { Size size = 0; + /* calculate partition info for pgproc entries etc */ + pgproc_partitions_prepare(); + /* ProcGlobal */ size = add_size(size, sizeof(PROC_HDR)); size = add_size(size, sizeof(slock_t)); @@ -149,6 +270,8 @@ ProcGlobalShmemSize(void) size = add_size(size, PGProcShmemSize()); size = add_size(size, FastPathLockShmemSize()); + size = add_size(size, PGProcPartitionsShmemSize()); + return size; } @@ -193,7 +316,7 @@ ProcGlobalSemas(void) void InitProcGlobal(void) { - PGPROC *procs; + PGPROC **procs; int i, j; bool found; @@ -212,6 +335,9 @@ InitProcGlobal(void) ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found); Assert(!found); + /* XXX call again, EXEC_BACKEND may not see the already computed value */ + pgproc_partitions_prepare(); + /* * Initialize the data structures. */ @@ -226,6 +352,15 @@ InitProcGlobal(void) pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PROC_NUMBER); pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PROC_NUMBER); + /* PGPROC partition registry */ + requestSize = PGProcPartitionsShmemSize(); + + ptr = ShmemInitStruct("PGPROC partitions", + requestSize, + &found); + + partitions = (PGProcPartition *) ptr; + /* * Create and initialize all the PGPROC structures we'll need. 
There are * six separate consumers: (1) normal backends, (2) autovacuum workers and @@ -241,21 +376,110 @@ InitProcGlobal(void) requestSize, &found); - MemSet(ptr, 0, requestSize); - - procs = (PGPROC *) ptr; - ptr = (char *) ptr + TotalProcs * sizeof(PGPROC); + /* allprocs (array of pointers to PGPROC entries) */ + procs = (PGPROC **) ptr; + ptr = (char *) ptr + CACHELINEALIGN(TotalProcs * sizeof(PGPROC *)); ProcGlobal->allProcs = procs; /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; + /* + * If NUMA partitioning is enabled, and we decided we actually can do the + * partitioning, allocate the chunks. + * + * Otherwise we'll allocate a single array for everything. It's not quite + * what we did without NUMA, because there's an extra level of + * indirection, but it's the best we can do. + */ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) + { + int node_procs; + int total_procs = 0; + + Assert(numa_procs_per_node > 0); + Assert(numa_nodes > 0); + + /* make sure to align the PGPROC array to memory page */ + ptr = (char *) TYPEALIGN(numa_page_size, ptr); + + /* + * Now initialize the PGPROC partition registry with one partition + * per NUMA node (and then one extra partition for auxiliary procs). 
+ */ + for (i = 0; i < numa_nodes; i++) + { + /* the last NUMA node may get fewer PGPROC entries, but meh */ + node_procs = Min(numa_procs_per_node, MaxBackends - total_procs); + + /* fill in the partition info */ + partitions[i].num_procs = node_procs; + partitions[i].numa_node = i; + partitions[i].pgproc_ptr = ptr; + + ptr = pgproc_partition_init(ptr, node_procs, total_procs, i); + + /* should have been aligned */ + Assert(ptr == (char *) TYPEALIGN(numa_page_size, ptr)); + + total_procs += node_procs; + + /* don't underflow/overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + } + + Assert(total_procs == MaxBackends); + + /* + * Also build PGPROC entries for auxiliary procs / prepared xacts (we + * however don't assign those to any NUMA node). + */ + node_procs = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* fill in the partition info */ + partitions[numa_nodes].num_procs = node_procs; + partitions[numa_nodes].numa_node = -1; + partitions[numa_nodes].pgproc_ptr = ptr; + + ptr = pgproc_partition_init(ptr, node_procs, total_procs, -1); + + total_procs += node_procs; + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + + Assert(total_procs == TotalProcs); + } + else + { + /* fill in the partition info (record the start before advancing) */ + partitions[0].num_procs = TotalProcs; + partitions[0].numa_node = -1; + partitions[0].pgproc_ptr = ptr; + + /* just treat everything as a single array, with no alignment */ + ptr = pgproc_partition_init(ptr, TotalProcs, 0, -1); + + /* don't overflow the allocation */ + Assert((ptr > (char *) procs) && (ptr <= (char *) procs + requestSize)); + } + + /* + * Don't memset the memory before locating it to NUMA nodes (which requires + * the pages to be allocated but not yet faulted in memory). Only zero the + * remainder of the allocation - ptr has already advanced past the PGPROC + * arrays, so zeroing requestSize bytes from here would overrun it. + */ + MemSet(ptr, 0, requestSize - ((char *) ptr - (char *) procs)); + /* * Allocate arrays mirroring PGPROC fields in a dense manner. See * PROC_HDR.
* * XXX: It might make sense to increase padding for these arrays, given * how hotly they are accessed. + * + * XXX Would it make sense to NUMA-partition these chunks too, somehow? + * But those arrays are tiny, fit into a single memory page, so would need + * to be made more complex. Not sure. */ ProcGlobal->xids = (TransactionId *) ptr; ptr = (char *) ptr + (TotalProcs * sizeof(*ProcGlobal->xids)); @@ -291,23 +515,91 @@ InitProcGlobal(void) /* Reserve space for semaphores. */ PGReserveSemaphores(ProcGlobalSemas()); - for (i = 0; i < TotalProcs; i++) + /* + * Mimic the logic we used to partition PGPROC entries. + */ + + /* + * If NUMA partitioning is enabled, and we decided we actually can do the + * partitioning, allocate the chunks. + * + * Otherwise we'll allocate a single array for everything. It's not quite + * what we did without NUMA, because there's an extra level of + * indirection, but it's the best we can do. + */ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) { - PGPROC *proc = &procs[i]; + int node_procs; + int total_procs = 0; - /* Common initialization for all PGPROCs, regardless of type. */ + Assert(numa_procs_per_node > 0); + + /* build PGPROC entries for NUMA nodes */ + for (i = 0; i < numa_nodes; i++) + { + /* the last NUMA node may get fewer PGPROC entries, but meh */ + node_procs = Min(numa_procs_per_node, MaxBackends - total_procs); + + /* make sure to align the PGPROC array to memory page */ + fpPtr = (char *) TYPEALIGN(numa_page_size, fpPtr); + + /* remember this pointer too */ + partitions[i].fastpath_ptr = fpPtr; + Assert(node_procs == partitions[i].num_procs); + + fpPtr = fastpath_partition_init(fpPtr, node_procs, total_procs, i, + fpLockBitsSize, fpRelIdSize); + + total_procs += node_procs; + + /* don't overflow the allocation */ + Assert(fpPtr <= fpEndPtr); + } + + Assert(total_procs == MaxBackends); /* - * Set the fast-path lock arrays, and move the pointer. 
 We interleave - the two arrays, to (hopefully) get some locality for each backend. + Also build PGPROC entries for auxiliary procs / prepared xacts (we + however don't assign those to any NUMA node). */ - proc->fpLockBits = (uint64 *) fpPtr; - fpPtr += fpLockBitsSize; + node_procs = (NUM_AUXILIARY_PROCS + max_prepared_xacts); + + /* make sure to align the PGPROC array to memory page */ + fpPtr = (char *) TYPEALIGN(numa_page_size, fpPtr); + + /* remember this pointer too */ + partitions[numa_nodes].fastpath_ptr = fpPtr; + Assert(node_procs == partitions[numa_nodes].num_procs); - proc->fpRelId = (Oid *) fpPtr; - fpPtr += fpRelIdSize; + fpPtr = fastpath_partition_init(fpPtr, node_procs, total_procs, -1, + fpLockBitsSize, fpRelIdSize); + total_procs += node_procs; + + /* don't overflow the allocation */ + Assert(fpPtr <= fpEndPtr); + + Assert(total_procs == TotalProcs); + } + else + { + /* remember this pointer too */ + partitions[0].fastpath_ptr = fpPtr; + Assert(TotalProcs == partitions[0].num_procs); + + /* just treat everything as a single array, with no alignment */ + fpPtr = fastpath_partition_init(fpPtr, TotalProcs, 0, -1, + fpLockBitsSize, fpRelIdSize); + + /* don't overflow the allocation */ Assert(fpPtr <= fpEndPtr); + } + + for (i = 0; i < TotalProcs; i++) + { + PGPROC *proc = procs[i]; + + Assert(proc->procnumber == i); /* * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact @@ -371,9 +663,6 @@ InitProcGlobal(void) pg_atomic_init_u64(&(proc->waitStart), 0); } - /* Should have consumed exactly the expected amount of fast-path memory. */ - Assert(fpPtr == fpEndPtr); - /* * Save pointers to the blocks of PGPROC structures reserved for auxiliary * processes and prepared transactions.
@@ -440,7 +729,51 @@ InitProcess(void) if (!dlist_is_empty(procgloballist)) { - MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + /* + * With numa interleaving of PGPROC, try to get a PROC entry from the + * right NUMA node (when the process starts). + * + * XXX The process may move to a different NUMA node later, but + * there's not much we can do about that. + */ + if ((numa_flags & NUMA_PROCS) != 0) + { + dlist_mutable_iter iter; + int node; + +#ifdef USE_LIBNUMA + int cpu = sched_getcpu(); + + if (cpu < 0) + elog(ERROR, "getcpu failed: %m"); + + node = numa_node_of_cpu(cpu); +#else + /* FIXME is defaulting to 0 correct? */ + node = 0; +#endif + + MyProc = NULL; + + dlist_foreach_modify(iter, procgloballist) + { + PGPROC *proc; + + proc = dlist_container(PGPROC, links, iter.cur); + + if (proc->numa_node == node) + { + MyProc = proc; + dlist_delete(iter.cur); + break; + } + } + } + + /* didn't find PGPROC from the correct NUMA node, pick any free one */ + if (MyProc == NULL) + MyProc = dlist_container(PGPROC, links, dlist_pop_head_node(procgloballist)); + SpinLockRelease(ProcStructLock); } else @@ -651,7 +984,7 @@ InitAuxiliaryProcess(void) */ for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++) { - auxproc = &AuxiliaryProcs[proctype]; + auxproc = AuxiliaryProcs[proctype]; if (auxproc->pid == 0) break; } @@ -1059,7 +1392,7 @@ AuxiliaryProcKill(int code, Datum arg) if (MyProc->pid != (int) getpid()) elog(PANIC, "AuxiliaryProcKill() called in child process"); - auxproc = &AuxiliaryProcs[proctype]; + auxproc = AuxiliaryProcs[proctype]; Assert(MyProc == auxproc); @@ -1108,7 +1441,7 @@ AuxiliaryPidGetProc(int pid) for (index = 0; index < NUM_AUXILIARY_PROCS; index++) { - PGPROC *proc = &AuxiliaryProcs[index]; + PGPROC *proc = AuxiliaryProcs[index]; if (proc->pid == pid) { @@ -1998,7 +2331,7 @@ ProcSendSignal(ProcNumber procNumber) if (procNumber < 0 || procNumber >= ProcGlobal->allProcCount) elog(ERROR, "procNumber out of 
 range"); - SetLatch(&ProcGlobal->allProcs[procNumber].procLatch); + SetLatch(&ProcGlobal->allProcs[procNumber]->procLatch); } /* @@ -2073,3 +2406,173 @@ BecomeLockGroupMember(PGPROC *leader, int pid) return ok; } + +/* + * pgproc_partitions_prepare + * Calculate parameters for partitioning PGPROC entries. + * + * NUMA partitioning + * + * Now build the actual PGPROC arrays, one "chunk" per NUMA node (and one + * extra for auxiliary processes and 2PC transactions, not associated with + * any particular node). + * + * First determine how many "backend" procs to allocate per NUMA node. The + * count may not be exactly divisible, but we mostly ignore that. The last + * node may get somewhat fewer PGPROC entries, but the imbalance ought to + * be pretty small (if MaxBackends >> numa_nodes). + * + * XXX A fairer distribution is possible, but not worth it for now. + */ +static void +pgproc_partitions_prepare(void) +{ + /* bail out if already initialized (calculate only once) */ + if (numa_nodes != -1) + return; + + /* XXX only gives us the number, the nodes may not be 0, 1, 2, ... */ +#ifdef USE_LIBNUMA + numa_nodes = numa_num_configured_nodes(); +#else + numa_nodes = 1; +#endif + + /* XXX can this happen? */ + if (numa_nodes < 1) + numa_nodes = 1; + + /* + * XXX A bit weird. Do we need to worry about postmaster? Could this even + * run outside postmaster? I don't think so. + * + * XXX Another issue is we may get different values than when sizing + * the memory, because at that point we didn't know if we get huge pages, + * so we assumed we will. Shouldn't cause crashes, but we might allocate + * shared memory and then not use some of it (because of the alignment + * that we don't actually need). Not sure about better way, good for now.
+ */ + // Assert(!IsUnderPostmaster); + + numa_page_size = pg_numa_page_size(); + + numa_procs_per_node = (MaxBackends + (numa_nodes - 1)) / numa_nodes; + + elog(DEBUG1, "NUMA: pgproc backends %d num_nodes %d per_node %d", + MaxBackends, numa_nodes, numa_procs_per_node); + + Assert(numa_nodes * numa_procs_per_node >= MaxBackends); + + /* success */ + numa_can_partition = true; +} + +/* + * + */ +static char * +pgproc_partition_init(char *ptr, int num_procs, int allprocs_index, int node) +{ + PGPROC *procs_node; + + /* allocate the PGPROC chunk for this node */ + procs_node = (PGPROC *) ptr; + + /* pointer right after this array */ + ptr = (char *) ptr + num_procs * sizeof(PGPROC); + + /* + * if node specified, move to node - do this before we start touching the + * memory, to make sure it's not mapped to any node yet + */ + if (node != -1) + { + /* align the pointer to the next page */ + ptr = (char *) TYPEALIGN(numa_page_size, ptr); + + pg_numa_move_to_node((char *) procs_node, ptr, node); + } + + elog(DEBUG1, "NUMA: pgproc_partition_init procs %p endptr %p num_procs %d node %d", + procs_node, ptr, num_procs, node); + + /* add pointers to the PGPROC entries to allProcs */ + for (int i = 0; i < num_procs; i++) + { + procs_node[i].numa_node = node; + procs_node[i].procnumber = allprocs_index; + + ProcGlobal->allProcs[allprocs_index] = &procs_node[i]; + + allprocs_index++; + } + + return ptr; +} + +static char * +fastpath_partition_init(char *ptr, int num_procs, int allprocs_index, int node, + Size fpLockBitsSize, Size fpRelIdSize) +{ + char *endptr = ptr + num_procs * (fpLockBitsSize + fpRelIdSize); + + /* + * if node specified, move to node - do this before we start touching the + * memory, to make sure it's not mapped to any node yet + */ + if (node != -1) + pg_numa_move_to_node(ptr, endptr, node); + + /* + * Now point the PGPROC entries to the fast-path arrays, and also advance + * the fpPtr. 
+ */ + for (int i = 0; i < num_procs; i++) + { + PGPROC *proc = ProcGlobal->allProcs[allprocs_index]; + + /* cross-check we got the expected NUMA node */ + Assert(proc->numa_node == node); + Assert(proc->procnumber == allprocs_index); + + /* + * Set the fast-path lock arrays, and move the pointer. We interleave + * the two arrays, to (hopefully) get some locality for each backend. + */ + proc->fpLockBits = (uint64 *) ptr; + ptr += fpLockBitsSize; + + proc->fpRelId = (Oid *) ptr; + ptr += fpRelIdSize; + + Assert(ptr <= endptr); + + allprocs_index++; + } + + Assert(ptr == endptr); + + return endptr; +} + +int +ProcPartitionCount(void) +{ + if (((numa_flags & NUMA_PROCS) != 0) && numa_can_partition) + return (numa_nodes + 1); + + return 1; +} + +void +ProcPartitionGet(int idx, int *node, int *nprocs, void **procsptr, void **fpptr) +{ + PGProcPartition *part = &partitions[idx]; + + Assert((idx >= 0) && (idx < ProcPartitionCount())); + + *nprocs = part->num_procs; + *procsptr = part->pgproc_ptr; + *fpptr = part->fastpath_ptr; + *node = part->numa_node; +} diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 9734aa315ff..aa524f6f7f3 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -23,6 +23,7 @@ extern PGDLLIMPORT void pg_numa_move_to_node(char *startptr, char *endptr, int n extern PGDLLIMPORT int numa_flags; #define NUMA_BUFFERS 0x01 +#define NUMA_PROCS 0x02 #ifdef USE_LIBNUMA diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index c6f5ebceefd..21f2619fd40 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -202,6 +202,8 @@ struct PGPROC * vacuum must not remove tuples deleted by * xid >= xmin ! 
*/ + int procnumber; /* index in ProcGlobal->allProcs */ + int pid; /* Backend's process ID; 0 if prepared xact */ int pgxactoff; /* offset into various ProcGlobal->arrays with @@ -327,6 +329,9 @@ struct PGPROC PGPROC *lockGroupLeader; /* lock group leader, if I'm a member */ dlist_head lockGroupMembers; /* list of members, if I'm a leader */ dlist_node lockGroupLink; /* my member link, if I'm a member */ + + /* NUMA node */ + int numa_node; }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ @@ -391,7 +396,7 @@ extern PGDLLIMPORT PGPROC *MyProc; typedef struct PROC_HDR { /* Array of PGPROC structures (not including dummies for prepared txns) */ - PGPROC *allProcs; + PGPROC **allProcs; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; @@ -438,13 +443,13 @@ typedef struct PROC_HDR extern PGDLLIMPORT PROC_HDR *ProcGlobal; -extern PGDLLIMPORT PGPROC *PreparedXactProcs; +extern PGDLLIMPORT PGPROC **PreparedXactProcs; /* * Accessors for getting PGPROC given a ProcNumber and vice versa. 
*/ -#define GetPGProcByNumber(n) (&ProcGlobal->allProcs[(n)]) -#define GetNumberFromPGProc(proc) ((proc) - &ProcGlobal->allProcs[0]) +#define GetPGProcByNumber(n) (ProcGlobal->allProcs[(n)]) +#define GetNumberFromPGProc(proc) ((proc)->procnumber) /* * We set aside some extra PGPROC structures for "special worker" processes, @@ -480,7 +485,7 @@ extern PGDLLIMPORT bool log_lock_waits; #ifdef EXEC_BACKEND extern PGDLLIMPORT slock_t *ProcStructLock; -extern PGDLLIMPORT PGPROC *AuxiliaryProcs; +extern PGDLLIMPORT PGPROC **AuxiliaryProcs; #endif @@ -520,4 +525,7 @@ extern PGPROC *AuxiliaryPidGetProc(int pid); extern void BecomeLockGroupLeader(void); extern bool BecomeLockGroupMember(PGPROC *leader, int pid); +extern int ProcPartitionCount(void); +extern void ProcPartitionGet(int idx, int *node, int *nprocs, void **procsptr, void **fpptr); + #endif /* _PROC_H_ */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 241f175e9da..e1bf02a3567 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1888,6 +1888,7 @@ PGP_MPI PGP_PubKey PGP_S2K PGPing +PGProcPartition PGQueryClass PGRUsage PGSemaphore -- 2.51.1