From 58e17af7c48fd6eeafcff9523ecdacbd53e90ede Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 14:20:18 +0100 Subject: [PATCH v22 6/7] Add new pg_shmem_numa_allocations view Introduce new pg_shmem_numa_allocations view that allows viewing the shared memory split layout across NUMA nodes. Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- doc/src/sgml/system-views.sgml | 79 ++++++++++++++ src/backend/catalog/system_views.sql | 8 ++ src/backend/storage/ipc/shmem.c | 130 +++++++++++++++++++++++ src/include/catalog/pg_proc.dat | 8 ++ src/test/regress/expected/numa.out | 12 +++ src/test/regress/expected/numa_1.out | 3 + src/test/regress/expected/privileges.out | 16 ++- src/test/regress/expected/rules.out | 4 + src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/numa.sql | 9 ++ src/test/regress/sql/privileges.sql | 6 +- 11 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 src/test/regress/expected/numa.out create mode 100644 src/test/regress/expected/numa_1.out create mode 100644 src/test/regress/sql/numa.sql diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 4f336ee0adf..6bb5c8a5669 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -181,6 +181,11 @@ shared memory allocations + + pg_shmem_numa_allocations + NUMA node mappings for shared memory allocations + + pg_stats planner statistics @@ -4051,6 +4056,80 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx + + <structname>pg_shmem_numa_allocations</structname> + + + pg_shmem_numa_allocations + + + + The pg_shmem_numa_allocations view shows how shared + memory allocations in the server's main shared memory segment are distributed + across NUMA nodes. 
This includes both memory allocated by + PostgreSQL itself and memory allocated + by extensions using the mechanisms detailed in + <xref linkend="xfunc-shared-addin"/>. + + + + Note that this view does not include memory allocated using the dynamic + shared memory infrastructure. + + + + <structname>pg_shmem_numa_allocations</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + The name of the shared memory allocation. + + + + + + node_id int4 + + + ID of NUMA node + + + + + + size int8 + + + Size of the allocation on this particular NUMA memory node in bytes + + + + + +
+ + + By default, the pg_shmem_numa_allocations view can be + read only by superusers or roles with privileges of the + pg_read_all_stats role. + +
+ <structname>pg_stats</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 273008db37f..52ab03a37be 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_numa_allocations AS + SELECT * FROM pg_get_shmem_numa_allocations(); + +REVOKE ALL ON pg_shmem_numa_allocations FROM PUBLIC; +GRANT SELECT ON pg_shmem_numa_allocations TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() TO pg_read_all_stats; + CREATE VIEW pg_backend_memory_contexts AS SELECT * FROM pg_get_backend_memory_contexts(); diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index e453f856794..36d89a58783 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -68,6 +68,7 @@ #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" +#include "port/pg_numa.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" #include "storage/shmem.h" @@ -90,6 +91,8 @@ slock_t *ShmemLock; /* spinlock for shared memory and LWLock static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* To get reliable results for NUMA inquiry we need to "touch pages" once */ +static bool firstNumaTouch = true; /* * InitShmemAccess() --- set up basic pointers to shared memory. 
@@ -570,3 +573,151 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
 	return (Datum) 0;
 }
+
+/*
+ * pg_get_shmem_numa_allocations
+ *		SQL SRF showing NUMA memory nodes for allocated shared memory.
+ *
+ * For every entry in the shmem index, asks the OS which NUMA node each OS
+ * page backing the entry resides on, and emits one row per NUMA node
+ * (0 .. pg_numa_get_max_node()) with columns (name, node_id, size), where
+ * size is the number of bytes of the entry's pages found on that node.
+ * Pages whose reported status is not a valid node number are not counted.
+ *
+ * Pages are touched before the inquiry only while firstNumaTouch is set,
+ * i.e. until this function has completed once in the current process.
+ *
+ * Raises an ERROR if pg_numa_init() fails or if the page inquiry fails.
+ *
+ * Compared to pg_get_shmem_allocations(), this ignores shared memory that
+ * was allocated without being counted via the shmem index, as well as
+ * as-of-yet unused shared memory.
+ */
+Datum
+pg_get_shmem_numa_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	void	  **page_ptrs;
+	int		   *pages_status;
+	uint64		shm_total_page_count,
+				shm_ent_page_count,
+				max_nodes;
+	Size	   *nodes;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* elog(ERROR) does not return, so nothing else is needed on failure */
+	if (pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+	max_nodes = pg_numa_get_max_node();
+	nodes = palloc(sizeof(Size) * (max_nodes + 1));
+
+	/*
+	 * The NUMA inquiry works in units of OS memory pages, so we need the OS
+	 * page size to split each shmem index entry into pages.
+	 */
+	os_page_size = pg_numa_get_pagesize();
+
+	/*
+	 * Allocate memory for page pointers and status based on total shared
+	 * memory size. This simplified approach allocates enough space for all
+	 * pages in shared memory rather than calculating the exact requirements
+	 * for each segment.
+	 *
+	 * Add 1, because an entry that starts and/or ends half-way through an OS
+	 * page covers one page more than its size alone suggests.
+	 */
+	shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
+	pages_status = palloc(sizeof(int) * shm_total_page_count);
+
+	if (firstNumaTouch)
+		elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		uint64		i;
+		char	   *startptr;
+		char	   *endptr;
+
+		/*
+		 * Calculate the range of OS pages used by this entry. The entry may
+		 * start / end half-way through a page, and we want to count these
+		 * pages too. So align the start / end pointers down / up, and derive
+		 * the number of pages from that.
+		 *
+		 * NOTE(review): this assumes the segment itself starts and ends on
+		 * OS page boundaries, so the widened range stays inside it -- confirm.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		shm_ent_page_count = (endptr - startptr) / os_page_size;
+		Assert(shm_ent_page_count <= shm_total_page_count);
+
+		/*
+		 * If we ever get 0xff (-1) back from the kernel inquiry, then we
+		 * probably have a bug in our entry to OS page mapping code here.
+		 */
+		memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
+
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			/*
+			 * In order to get reliable results we also need to touch memory
+			 * pages, so that inquiry about NUMA memory node doesn't return -2
+			 * (which indicates unmapped/unallocated pages).
+			 */
+			volatile uint64 touch pg_attribute_unused();
+
+			page_ptrs[i] = startptr + (i * os_page_size);
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, page_ptrs[i]);
+
+			CHECK_FOR_INTERRUPTS();
+		}
+
+		if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status: %m");
+
+		/* Count the pages of this shared memory entry found on each node */
+		memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
+		for (i = 0; i < shm_ent_page_count; i++)
+		{
+			int			s = pages_status[i];
+
+			/* Ensure we are adding only valid index to the array */
+			if (s >= 0 && (uint64) s <= max_nodes)
+				nodes[s]++;
+		}
+
+		/* Emit one row per NUMA node, including nodes holding no pages */
+		for (i = 0; i <= max_nodes; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			values[1] = Int32GetDatum((int32) i);
+			values[2] = Int64GetDatum(nodes[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	LWLockRelease(ShmemIndexLock);
+	firstNumaTouch = false;
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 63859661951..72efe8df667 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8522,6 +8522,14 @@ proname => 'pg_numa_available', provolatile => 's', prorettype => 'bool', proargtypes => '', prosrc => 'pg_numa_available' }, +# shared memory usage with NUMA info +{ oid => '9686', descr => 'NUMA mappings for the main shared memory segment', + proname => 'pg_get_shmem_numa_allocations', prorows => '50', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,int4,int8}', proargmodes => '{o,o,o}', + proargnames => '{name,node_id,size}', + prosrc => 'pg_get_shmem_numa_allocations' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', diff --git a/src/test/regress/expected/numa.out b/src/test/regress/expected/numa.out new file mode 100644 index 00000000000..fb882c5b771 --- /dev/null +++ b/src/test/regress/expected/numa.out @@ -0,0 +1,12 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif +-- switch to superuser +\c - +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations; + ok +---- + t +(1 row) + diff --git a/src/test/regress/expected/numa_1.out b/src/test/regress/expected/numa_1.out new file mode 100644 index 00000000000..6dd6824b4e4 --- /dev/null +++ b/src/test/regress/expected/numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 5588d83e1bf..f66cf1bbfbd 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -3127,8 +3127,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; -- clean up DROP TABLE 
lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_numa_allocations and pg_backend_memory_contexts. -- switch to superuser \c - CREATE ROLE regress_readallstats; @@ -3150,6 +3150,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT f (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no + has_table_privilege +--------------------- + f +(1 row) + GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes has_table_privilege @@ -3169,6 +3175,12 @@ SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT t (1 row) +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes + has_table_privilege +--------------------- + t +(1 row) + -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; SELECT COUNT(*) >= 0 AS ok FROM pg_aios; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 673c63b8d1b..8b5862cb11a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1757,6 +1757,10 @@ pg_shmem_allocations| SELECT name, size, allocated_size FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); +pg_shmem_numa_allocations| SELECT name, + node_id, + size + FROM pg_get_shmem_numa_allocations() pg_get_shmem_numa_allocations(name, node_id, size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 0a35f2f8f6a..0f38caa0d24 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -119,7 +119,7 @@ test: plancache limit 
plpgsql copy2 temp domain rangefuncs prepare conversion tr # The stats test resets stats, so nothing else needing stats access can be in # this group. # ---------- -test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate +test: partition_join partition_prune reloptions hash_part indexing partition_aggregate partition_info tuplesort explain compression memoize stats predicate numa # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL diff --git a/src/test/regress/sql/numa.sql b/src/test/regress/sql/numa.sql new file mode 100644 index 00000000000..fddb21a260a --- /dev/null +++ b/src/test/regress/sql/numa.sql @@ -0,0 +1,9 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +-- switch to superuser +\c - + +SELECT COUNT(*) >= 0 AS ok FROM pg_shmem_numa_allocations; diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 286b1d03756..ca51dfd7702 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -1911,8 +1911,8 @@ REVOKE MAINTAIN ON lock_table FROM regress_locktable_user; DROP TABLE lock_table; DROP USER regress_locktable_user; --- test to check privileges of system views pg_shmem_allocations and --- pg_backend_memory_contexts. +-- test to check privileges of system views pg_shmem_allocations, +-- pg_shmem_numa_allocations and pg_backend_memory_contexts. 
-- switch to superuser \c - @@ -1922,12 +1922,14 @@ CREATE ROLE regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- no GRANT pg_read_all_stats TO regress_readallstats; SELECT has_table_privilege('regress_readallstats','pg_aios','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- yes SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- yes +SELECT has_table_privilege('regress_readallstats','pg_shmem_numa_allocations','SELECT'); -- yes -- run query to ensure that functions within views can be executed SET ROLE regress_readallstats; -- 2.49.0