From 77149659f37dd8943a85ab5cf61c96c6cc9dcebd Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 11:17:28 +0100 Subject: [PATCH v7 2/3] Extend pg_buffercache with new view pg_buffercache_numa to show NUMA zone for individual buffer. Author: Jakub Wartak Co-authored-by: Bertrand Drouvot Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- contrib/pg_buffercache/Makefile | 3 +- .../expected/pg_buffercache.out | 30 +++- contrib/pg_buffercache/meson.build | 1 + .../pg_buffercache--1.5--1.6.sql | 35 ++++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 156 +++++++++++++++++- contrib/pg_buffercache/sql/pg_buffercache.sql | 19 ++- src/backend/utils/misc/guc_tables.c | 2 +- src/include/storage/pg_shmem.h | 1 + 9 files changed, 237 insertions(+), 12 deletions(-) create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e..2a33602537 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,7 +8,8 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index b745dc69ea..f34f137075 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -8,6 +8,18 @@ from pg_buffercache; t (1 row) +-- to ignore potential NOTICE: libnuma initialization failed.. 
+SET client_min_messages TO warning ; +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +RESET client_min_messages; select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -28,12 +40,19 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; ERROR: permission denied for view pg_buffercache -SELECT * FROM pg_buffercache_pages() AS p (wrong int); +SELECT * FROM pg_buffercache_pages(false) AS p (wrong int); +ERROR: permission denied for function pg_buffercache_pages +SELECT * FROM pg_buffercache_pages(true) AS p (wrong int); ERROR: permission denied for function pg_buffercache_pages SELECT * FROM pg_buffercache_summary(); ERROR: permission denied for function pg_buffercache_summary SELECT * FROM pg_buffercache_usage_counts(); ERROR: permission denied for function pg_buffercache_usage_counts +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT * FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET client_min_messages; RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; @@ -55,3 +74,12 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); t (1 row) +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? 
+---------- + t +(1 row) + +RESET client_min_messages; diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe4871..9b2e939341 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 0000000000..448d08196f --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,35 @@ +/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit + +-- Swap pg_buffercache_pages() for pg_buffercache_pages(boolean); the view uses it, so it goes first. +DROP VIEW pg_buffercache; +DROP FUNCTION pg_buffercache_pages(); + +CREATE FUNCTION pg_buffercache_pages(boolean) +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pages' +LANGUAGE C PARALLEL SAFE; + +-- Recreate the convenience view on top of the new function (false = no NUMA lookup). +CREATE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages(false) AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4); + +CREATE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_pages(true) AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, numa_zone_id int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_pages(boolean) FROM PUBLIC; +REVOKE ALL ON pg_buffercache FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_pages(boolean) TO pg_monitor; +GRANT SELECT ON pg_buffercache TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 5ee875f77d..b030ba3a6f 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 3ae0a018e1..f32546fdee 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -11,12 +11,14 @@ #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 @@ -43,6 +45,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. 
*/ int32 pinning_backends; + int32 numa_zone_id; } BufferCachePagesRec; @@ -65,6 +68,52 @@ PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); +static void +pg_buffercache_mark_numa_invalid(BufferCachePagesContext *fctx, int n) +{ + int i; + + for (i = 0; i < n; i++) + { + fctx->record[i].numa_zone_id = -1; + } +} + +/* +* Several buffers can share one OS page (when BLCKSZ is smaller +* than the OS page size); only the first address seen for each +* OS page is recorded. +* +* To get reliable results the memory pages must also be touched, +* otherwise the NUMA zone inquiry reports -2 (-ENOENT, page not +* present) for them. +*/ +static inline void +pg_buffercache_numa_prepare_ptrs(int i, float pages_per_blk, Size os_page_size, + void **os_page_ptrs, bool firstUseInBackend) +{ + int j = 0, + blk2page = (int) (i * pages_per_blk); + + do + { + if (os_page_ptrs[blk2page + j] == NULL) + { + volatile uint64 touch pg_attribute_unused(); + + /* Buffer ids are 1-based: slot i is BufferGetBlock(i + 1) */ + os_page_ptrs[blk2page + j] = (char *) BufferGetBlock(i + 1) + (os_page_size * j); + + /* Touching is needed only once per backend */ + if (firstUseInBackend) + pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2page + j]); + + CHECK_FOR_INTERRUPTS(); + } + j++; + } while (j < (int) pages_per_blk); +} + Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { @@ -75,14 +124,32 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; + bool query_numa = PG_NARGS() > 0 && !PG_ARGISNULL(0) && PG_GETARG_BOOL(0); + static bool firstUseInBackend = true; if (SRF_IS_FIRSTCALL()) { int i; + Size os_page_size = 0; + void **os_page_ptrs = NULL; + int *os_pages_status = NULL; + int os_page_count = 0; + float pages_per_blk = 0; funcctx = SRF_FIRSTCALL_INIT(); - /* Switch context when allocating stuff to be used in later calls */ + if (query_numa) + { + if (pg_numa_init() == -1) + { + elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this 
platform, some NUMA data might be unavailable."); + query_numa = false; + } + } + + /* + * Switch context when allocating stuff to be used in later calls + */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ @@ -122,10 +189,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id", + INT4OID, -1, 0); + fctx->tupdesc = BlessTupleDesc(tupledesc); /* Allocate NBuffers worth of BufferCachePagesRec records. */ @@ -137,9 +208,41 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) funcctx->max_calls = NBuffers; funcctx->user_fctx = fctx; - /* Return to original context when allocating transient memory */ + /* + * Return to original context when allocating transient memory + */ MemoryContextSwitchTo(oldcontext); + if (query_numa) + { + /* + * This is for gathering some NUMA statistics. We might be using + * various DB block sizes (4kB, 8kB , .. 
32kB) that end up being + * allocated in OS memory pages of various sizes, so first get the + * OS page size before calling move_pages(). os_page_count rounds + * up: the buffers need not fill a whole number of (huge) pages. + */ + os_page_size = pg_numa_get_pagesize(); + os_page_count = ((uint64) NBuffers * BLCKSZ + os_page_size - 1) / os_page_size; + pages_per_blk = (float) BLCKSZ / os_page_size; + + elog(DEBUG1, "NUMA: os_page_count=%d os_page_size=%zu pages_per_blk=%f", + os_page_count, os_page_size, pages_per_blk); + + os_page_ptrs = palloc(sizeof(void *) * os_page_count); + os_pages_status = palloc(sizeof(int) * os_page_count); + memset(os_page_ptrs, 0, sizeof(void *) * os_page_count); + + /* + * If we ever get 0xff back from kernel inquiry, then we probably + * have a bug in our buffer to OS page mapping code here + */ + memset(os_pages_status, 0xff, sizeof(int) * os_page_count); + + if (firstUseInBackend) + elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); + } + /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -171,14 +274,41 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) else fctx->record[i].isdirty = false; - /* Note if the buffer is valid, and has storage created */ + /* + * Note if the buffer is valid, and has storage created + */ if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) fctx->record[i].isvalid = true; else fctx->record[i].isvalid = false; UnlockBufHdr(bufHdr, buf_state); + + /* Header lock is released first: this faults pages and checks interrupts */ + if (unlikely(query_numa)) + pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size, os_page_ptrs, firstUseInBackend); } + + if (query_numa) + { + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + + for (i = 0; i < NBuffers; i++) + { + int blk2page = (int) (i * pages_per_blk); + + /* + * Technically we can get errors too here and pass that to + * user. Also we could somehow report single DB block spanning + * more than one NUMA zone, but it should be rare. 
+ */ + fctx->record[i].numa_zone_id = os_pages_status[blk2page]; + } + } + else + pg_buffercache_mark_numa_invalid(fctx, NBuffers); + } funcctx = SRF_PERCALL_SETUP(); @@ -209,8 +339,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[5] = true; nulls[6] = true; nulls[7] = true; - /* unused for v1.0 callers, but the array is always long enough */ + + /* + * unused for v1.0 callers, but the array is always long enough + */ nulls[8] = true; + nulls[9] = true; } else { @@ -228,9 +362,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[6] = false; values[7] = Int16GetDatum(fctx->record[i].usagecount); nulls[7] = false; - /* unused for v1.0 callers, but the array is always long enough */ + + /* + * unused for v1.0 callers, but the array is always long enough + */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; + values[9] = Int32GetDatum(fctx->record[i].numa_zone_id); + nulls[9] = false; } /* Build and return the tuple. */ @@ -240,7 +379,10 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) SRF_RETURN_NEXT(funcctx, result); } else + { + firstUseInBackend = false; SRF_RETURN_DONE(funcctx); + } } Datum diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 944fbb1bea..7f2ce683e6 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -5,6 +5,14 @@ select count(*) = (select setting::bigint where name = 'shared_buffers') from pg_buffercache; +-- to ignore potential NOTICE: libnuma initialization failed.. 
+SET client_min_messages TO warning ; +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; +RESET client_min_messages; + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -16,9 +24,14 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; -- having to create a dedicated user, use the pg_database_owner pseudo-role. SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; -SELECT * FROM pg_buffercache_pages() AS p (wrong int); +SELECT * FROM pg_buffercache_pages(false) AS p (wrong int); +SELECT * FROM pg_buffercache_pages(true) AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT * FROM pg_buffercache_numa; +RESET client_min_messages; RESET role; -- Check that pg_monitor is allowed to query view / function @@ -26,3 +39,7 @@ SET ROLE pg_monitor; SELECT count(*) > 0 FROM pg_buffercache; SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); +-- to ignore potential NOTICE: libnuma initialization failed.. 
+SET client_min_messages TO warning ; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET client_min_messages; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index ad25cbb39c..dd34c79f52 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -563,7 +563,7 @@ static int ssl_renegotiation_limit; */ int huge_pages = HUGE_PAGES_TRY; int huge_page_size; -static int huge_pages_status = HUGE_PAGES_UNKNOWN; +int huge_pages_status = HUGE_PAGES_UNKNOWN; /* * These variables are all dummies that don't do anything, except in some diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index b99ebc9e86..5f7d4b83a6 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -45,6 +45,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; +extern PGDLLIMPORT int huge_pages_status; /* Possible values for huge_pages and huge_pages_status */ typedef enum -- 2.39.5