From 7ad8be9e522a9abc95c81c51da332dfb3edc47fc Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 11:17:28 +0100 Subject: [PATCH v6 2/3] Extend pg_buffercache with new view pg_buffercache_numa to show NUMA zone for individual buffer. Author: Jakub Wartak Co-authored-by: Bertrand Drouvot Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- contrib/pg_buffercache/Makefile | 3 +- .../expected/pg_buffercache.out | 30 +++- contrib/pg_buffercache/meson.build | 1 + .../pg_buffercache--1.5--1.6.sql | 35 ++++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 154 +++++++++++++++++- contrib/pg_buffercache/sql/pg_buffercache.sql | 19 ++- src/backend/utils/misc/guc_tables.c | 2 +- src/include/storage/pg_shmem.h | 1 + 9 files changed, 234 insertions(+), 13 deletions(-) create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e5..2a33602537e 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,7 +8,8 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index b745dc69eae..f34f137075e 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -8,6 +8,18 @@ from pg_buffercache; t (1 row) +-- to ignore potential NOTICE: libnuma initialization failed.. 
+SET client_min_messages TO warning ; +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +RESET client_min_messages; select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -28,12 +40,19 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; ERROR: permission denied for view pg_buffercache -SELECT * FROM pg_buffercache_pages() AS p (wrong int); +SELECT * FROM pg_buffercache_pages(false) AS p (wrong int); +ERROR: permission denied for function pg_buffercache_pages +SELECT * FROM pg_buffercache_pages(true) AS p (wrong int); ERROR: permission denied for function pg_buffercache_pages SELECT * FROM pg_buffercache_summary(); ERROR: permission denied for function pg_buffercache_summary SELECT * FROM pg_buffercache_usage_counts(); ERROR: permission denied for function pg_buffercache_usage_counts +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT * FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET client_min_messages; RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; @@ -55,3 +74,12 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); t (1 row) +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? 
+---------- + t +(1 row) + +RESET client_min_messages; diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe48717..9b2e9393410 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 00000000000..448d08196f3 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,35 @@ +/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_buffercache" to load this file. \quit + +-- Register the new function. +DROP VIEW pg_buffercache; +DROP FUNCTION pg_buffercache_pages(); + +CREATE OR REPLACE FUNCTION pg_buffercache_pages(boolean) +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages(false) AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4); + +CREATE OR REPLACE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_pages(true) AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, numa_zone_id int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_pages(boolean) FROM PUBLIC; +REVOKE ALL ON pg_buffercache FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_pages(boolean) TO pg_monitor; +GRANT SELECT ON pg_buffercache TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 5ee875f77dd..b030ba3a6fa 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 3ae0a018e10..dfe53eb8471 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -6,6 +6,7 @@ * contrib/pg_buffercache/pg_buffercache_pages.c *------------------------------------------------------------------------- */ +#include "pg_config.h" #include "postgres.h" #include "access/htup_details.h" @@ -13,10 +14,12 @@ #include "funcapi.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "port/pg_numa.h" +#include "storage/pg_shmem.h" #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 @@ -43,6 +46,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. 
*/ int32 pinning_backends; + int32 numa_zone_id; } BufferCachePagesRec; @@ -65,6 +69,17 @@ PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); +static void +pg_buffercache_mark_numa_invalid(BufferCachePagesContext *fctx, int n) +{ + int i; + + for (i = 0; i < n; i++) + { + fctx->record[i].numa_zone_id = -1; + } +} + Datum pg_buffercache_pages(PG_FUNCTION_ARGS) { @@ -75,14 +90,33 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; + Buffer query_numa = PG_GETARG_BOOL(0); if (SRF_IS_FIRSTCALL()) { - int i; + int i, + blk2page, + j; + Size os_page_size; + void **os_page_ptrs; + int *os_pages_status; + int os_page_count; + float pages_per_blk; funcctx = SRF_FIRSTCALL_INIT(); - /* Switch context when allocating stuff to be used in later calls */ + if (query_numa) + { + if (pg_numa_init() == -1) + { + elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable."); + query_numa = false; + } + } + + /* + * Switch context when allocating stuff to be used in later calls + */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ @@ -122,10 +156,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id", + INT4OID, -1, 0); + fctx->tupdesc = BlessTupleDesc(tupledesc); /* Allocate NBuffers worth of BufferCachePagesRec records. 
*/ @@ -137,9 +175,35 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) funcctx->max_calls = NBuffers; funcctx->user_fctx = fctx; - /* Return to original context when allocating transient memory */ + /* + * Return to original context when allocating transient memory + */ MemoryContextSwitchTo(oldcontext); + /* + * This is for gathering some NUMA statistics. We might be using + * various DB block sizes (4kB, 8kB , .. 32kB) that end up being + * allocated in various different OS memory pages sizes, so first we + * need to understand the OS memory page size before calling + * move_pages() + */ + os_page_size = pg_numa_get_pagesize(); + os_page_count = ((uint64)NBuffers * BLCKSZ) / os_page_size; + pages_per_blk = (float) BLCKSZ / os_page_size; + + elog(DEBUG1, "NUMA os_page_count=%d os_page_size=%ld pages_per_blk=%f", + os_page_count, os_page_size, pages_per_blk); + + os_page_ptrs = palloc(sizeof(void *) * os_page_count); + os_pages_status = palloc(sizeof(int) * os_page_count); + memset(os_page_ptrs, 0, sizeof(void *) * os_page_count); + + /* + * If we ever get 0xff back from kernel inquiry, then we probably have + * bug in our buffers to OS page mapping code here + */ + memset(os_pages_status, 0xff, sizeof(int) * os_page_count); + /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -171,14 +235,79 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) else fctx->record[i].isdirty = false; - /* Note if the buffer is valid, and has storage created */ + /* + * Note if the buffer is valid, and has storage created + */ if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) fctx->record[i].isvalid = true; else fctx->record[i].isvalid = false; + if (query_numa) + { + blk2page = (int) i * pages_per_blk; + j = 0; + do + { + /* + * Many buffers can point to the same page (in case of + * BLCKSZ < 4kB), but we want to also query just first + * address. 
+ * + * In order to get reliable results we also need to touch + * memory pages, so that inquiry about NUMA zone doesn't + * return -2. + */ + if (os_page_ptrs[blk2page + j] == 0) + { + volatile uint64 touch pg_attribute_unused(); + + /* + * NBuffers count start really from 1 + */ + os_page_ptrs[blk2page + j] = (char *) BufferGetBlock(i + 1) + (os_page_size * j); + pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2page + j]); + + /* + * Every 1GB of scanned memory we give process chance + * to respond + */ +#define ONE_GIGABYTE 1024*1024*1024 + if ((i * os_page_size) % ONE_GIGABYTE == 0) + CHECK_FOR_INTERRUPTS(); + } + j++; + } while (j < (int) pages_per_blk); + } + UnlockBufHdr(bufHdr, buf_state); } + + + if (query_numa) + { + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + + for (i = 0; i < NBuffers; i++) + { + blk2page = (int) i * pages_per_blk; + + /* + * Technically we can get errors too here and pass that to + * user + * + * XXX:: also we could somehow report single DB block spanning + * more than 2 NUMA zones, but it should be rare (?) 
+ */ + fctx->record[i].numa_zone_id = os_pages_status[blk2page]; + } + } + else + pg_buffercache_mark_numa_invalid(fctx, NBuffers); + + pfree(os_page_ptrs); + pfree(os_pages_status); } funcctx = SRF_PERCALL_SETUP(); @@ -209,8 +338,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[5] = true; nulls[6] = true; nulls[7] = true; - /* unused for v1.0 callers, but the array is always long enough */ + + /* + * unused for v1.0 callers, but the array is always long enough + */ nulls[8] = true; + nulls[9] = true; } else { @@ -228,9 +361,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[6] = false; values[7] = Int16GetDatum(fctx->record[i].usagecount); nulls[7] = false; - /* unused for v1.0 callers, but the array is always long enough */ + + /* + * unused for v1.0 callers, but the array is always long enough + */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; + values[9] = Int32GetDatum(fctx->record[i].numa_zone_id); + nulls[9] = false; } /* Build and return the tuple. */ diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 944fbb1beae..7f2ce683e6c 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -5,6 +5,14 @@ select count(*) = (select setting::bigint where name = 'shared_buffers') from pg_buffercache; +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; +RESET client_min_messages; + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -16,9 +24,14 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; -- having to create a dedicated user, use the pg_database_owner pseudo-role. 
SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; -SELECT * FROM pg_buffercache_pages() AS p (wrong int); +SELECT * FROM pg_buffercache_pages(false) AS p (wrong int); +SELECT * FROM pg_buffercache_pages(true) AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT * FROM pg_buffercache_numa; +RESET client_min_messages; RESET role; -- Check that pg_monitor is allowed to query view / function @@ -26,3 +39,7 @@ SET ROLE pg_monitor; SELECT count(*) > 0 FROM pg_buffercache; SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); +-- to ignore potential NOTICE: libnuma initialization failed.. +SET client_min_messages TO warning ; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET client_min_messages; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 03a6dd49154..172309d389a 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -562,7 +562,7 @@ static int ssl_renegotiation_limit; */ int huge_pages = HUGE_PAGES_TRY; int huge_page_size; -static int huge_pages_status = HUGE_PAGES_UNKNOWN; +int huge_pages_status = HUGE_PAGES_UNKNOWN; /* * These variables are all dummies that don't do anything, except in some diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index b99ebc9e86f..5f7d4b83a60 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -45,6 +45,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; +extern PGDLLIMPORT int huge_pages_status; /* Possible values for huge_pages and huge_pages_status */ typedef enum -- 2.39.5