From 72b0a9b86da7e1a6e239f5fdead24aeaa3d99f27 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Fri, 5 Dec 2025 14:53:47 +0100 Subject: [PATCH v20251208] numa: Simple interleaving and MAP_POPULATE Allows NUMA interleaving on the shared memory segment, to ensure memory is balanced between the NUMA nodes. The patch also allows prefaulting the shared memory (by setting MAP_POPULATE flag), to actually allocate the pages on nodes. The commit adds two GUC parameters, both set to 'off' by default: - shared_memory_interleave (enables NUMA interleaving) - shared_memory_populate (sets MAP_POPULATE) The memory is interleaved on all nodes enabled in the cpuset, as returned by numa_get_mems_allowed(). The interleaving is applied at the memory page granularity, and is oblivious to what's stored in it. --- src/backend/port/sysv_shmem.c | 51 ++++++++++++++++++- src/backend/utils/misc/guc_parameters.dat | 12 +++++ src/backend/utils/misc/guc_tables.c | 2 + src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/port/pg_numa.h | 4 ++ src/include/storage/pg_shmem.h | 3 ++ src/port/pg_numa.c | 42 +++++++++++++++ 7 files changed, 115 insertions(+), 1 deletion(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 298ceb3e218..b300f5ef4a7 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -19,6 +19,7 @@ */ #include "postgres.h" +#include #include #include #include @@ -29,6 +30,7 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "port/pg_numa.h" #include "portability/mem.h" #include "storage/dsm.h" #include "storage/fd.h" @@ -602,6 +604,28 @@ CreateAnonymousSegment(Size *size) void *ptr = MAP_FAILED; int mmap_errno = 0; + /* + * When asked to use NUMA-interleaving, we need to set the policy before + * touching the memory. By default that'll happen later, when either + * initializing some of the shmem structures (e.g. buffer descriptors), or + * when running queries. 
In that case it's enough to set the policy after + * the mmap() call, and we don't need to do anything here. + * + * With MAP_POPULATE, the mmap() itself will prefault the pages, so we + * need to set the policy to interleave before the mmap() call, and then + * revert to localalloc (so that private memory is allocated locally). + * + * XXX It probably is not a good idea to enable interleaving with regular + * memory pages, because then each buffer will get split on two nodes, and + * the system won't be able to fix that by migrating one of the pages. But + * we leave that up to the admin, instead of forbidding it. + */ + if (shared_memory_interleave && shared_memory_populate) + { + /* set the allocation to interleave on nodes allowed by the cpuset */ + pg_numa_set_interleave(); + } + #ifndef MAP_HUGETLB /* PGSharedMemoryCreate should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON); @@ -619,6 +643,10 @@ CreateAnonymousSegment(Size *size) if (allocsize % hugepagesize != 0) allocsize += hugepagesize - (allocsize % hugepagesize); + /* populate the shared memory if requested */ + if (shared_memory_populate) + mmap_flags |= MAP_POPULATE; + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS | mmap_flags, -1, 0); mmap_errno = errno; @@ -638,13 +666,19 @@ CreateAnonymousSegment(Size *size) if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) { + int mmap_flags = 0; + + /* populate the shared memory if requested */ + if (shared_memory_populate) + mmap_flags |= MAP_POPULATE; + /* * Use the original size, not the rounded-up value, when falling back * to non-huge pages. 
*/ allocsize = *size; ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); + PG_MMAP_FLAGS | mmap_flags, -1, 0); mmap_errno = errno; } @@ -663,6 +697,21 @@ CreateAnonymousSegment(Size *size) allocsize) : 0)); } + /* + * With NUMA interleaving, we need to either apply interleaving for the + * shmem segment we just allocated, or reset the memory policy to local + * allocation (when using MAP_POPULATE). + */ + if (shared_memory_interleave) + { + if (shared_memory_populate) + /* revert back to using the local node */ + pg_numa_set_localalloc(); + else + /* apply interleaving to the new memory segment */ + pg_numa_interleave_memory(ptr, allocsize); + } + *size = allocsize; return ptr; } diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 3b9d8349078..ce1c0c4327f 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2597,6 +2597,18 @@ max => 'INT_MAX / 2', }, +{ name => 'shared_memory_interleave', type => 'bool', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Enables NUMA interleaving of shared memory.', + variable => 'shared_memory_interleave', + boot_val => 'false', +}, + +{ name => 'shared_memory_populate', type => 'bool', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Populates shared memory at start.', + variable => 'shared_memory_populate', + boot_val => 'false', +}, + { name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows the size of the server\'s main shared memory area (rounded up to the nearest MB).', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_UNIT_MB | GUC_RUNTIME_COMPUTED', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index f87b558c2c6..cfee0df987f 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -580,6 +580,8 @@ 
static int ssl_renegotiation_limit; int huge_pages = HUGE_PAGES_TRY; int huge_page_size; int huge_pages_status = HUGE_PAGES_UNKNOWN; +bool shared_memory_interleave = false; +bool shared_memory_populate = false; /* * These variables are all dummies that don't do anything, except in some diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index dc9e2255f8a..de1276f6897 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -152,6 +152,8 @@ # sysv # windows # (change requires restart) +#shared_memory_interleave = off # interleave all memory on available NUMA nodes +#shared_memory_populate = off # prefault shared memory on start #dynamic_shared_memory_type = posix # the default is usually the first option # supported by the operating system: # posix diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 9d1ea6d0db8..dc9f13d9fa0 100644 --- a/src/include/port/pg_numa.h +++ b/src/include/port/pg_numa.h @@ -18,6 +18,10 @@ extern PGDLLIMPORT int pg_numa_init(void); extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); extern PGDLLIMPORT int pg_numa_get_max_node(void); +extern PGDLLIMPORT void pg_numa_set_interleave(void); +extern PGDLLIMPORT void pg_numa_set_localalloc(void); +extern PGDLLIMPORT void pg_numa_interleave_memory(void *ptr, Size size); + #ifdef USE_LIBNUMA /* diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 5f7d4b83a60..7b56bd5b44f 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -47,6 +47,9 @@ extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; extern PGDLLIMPORT int huge_pages_status; +extern PGDLLIMPORT bool shared_memory_interleave; +extern PGDLLIMPORT bool shared_memory_populate; + /* Possible values for huge_pages and huge_pages_status */ typedef enum { diff --git a/src/port/pg_numa.c 
b/src/port/pg_numa.c
index 540ada3f8ef..a91d339033f 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -116,6 +116,33 @@ pg_numa_get_max_node(void)
 	return numa_max_node();
 }
 
+/*
+ * Set the task's allocation policy to interleave on all cpuset memory nodes.
+ */
+void
+pg_numa_set_interleave(void)
+{
+	numa_set_interleave_mask(numa_get_mems_allowed());
+}
+
+/*
+ * Revert the task's allocation policy to local-node allocation.
+ */
+void
+pg_numa_set_localalloc(void)
+{
+	numa_set_localalloc();
+}
+
+/*
+ * Apply the interleave policy (all cpuset memory nodes) to [ptr, ptr + size).
+ */
+void
+pg_numa_interleave_memory(void *ptr, Size size)
+{
+	numa_interleave_memory(ptr, size, numa_get_mems_allowed());
+}
+
 #else
 
 /* Empty wrappers */
@@ -138,4 +165,19 @@ pg_numa_get_max_node(void)
 	return 0;
 }
 
+void
+pg_numa_set_interleave(void)
+{
+}
+
+void
+pg_numa_set_localalloc(void)
+{
+}
+
+void
+pg_numa_interleave_memory(void *ptr, Size size)
+{
+}
+
 #endif
-- 
2.51.1