From 78607bd84be0b9a448491bcb0a7d3c6b8a042d1c Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Fri, 21 Feb 2025 11:17:28 +0100 Subject: [PATCH v12 2/3] Extend pg_buffercache with new view pg_buffercache_numa to show NUMA zone for individual buffer. Author: Jakub Wartak Reviewed-by: Andres Freund Reviewed-by: Bertrand Drouvot Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com --- contrib/pg_buffercache/Makefile | 5 +- .../expected/pg_buffercache_numa.out | 28 ++ .../expected/pg_buffercache_numa_1.out | 3 + contrib/pg_buffercache/meson.build | 2 + .../pg_buffercache--1.5--1.6.sql | 42 ++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 463 +++++++++++++----- .../sql/pg_buffercache_numa.sql | 20 + doc/src/sgml/pgbuffercache.sgml | 64 ++- 9 files changed, 494 insertions(+), 135 deletions(-) create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa.out create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa_1.out create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_numa.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index eae65ead9e5..2a33602537e 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -8,7 +8,8 @@ OBJS = \ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ - pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql + pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ + pg_buffercache--1.5--1.6.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" -REGRESS = pg_buffercache +REGRESS = pg_buffercache pg_buffercache_numa diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out new file mode 100644 index 00000000000..d4de5ea52fc
--- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out @@ -0,0 +1,28 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +ERROR: permission denied for view pg_buffercache_numa +RESET role; +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; + ?column? +---------- + t +(1 row) + +RESET role; diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out new file mode 100644 index 00000000000..6dd6824b4e4 --- /dev/null +++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out @@ -0,0 +1,3 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 12d1fe48717..7cd039a1df9 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -23,6 +23,7 @@ install_data( 'pg_buffercache--1.2.sql', 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', + 'pg_buffercache--1.5--1.6.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) @@ -34,6 +35,7 @@ tests += { 'regress': { 'sql': [ 'pg_buffercache', + 'pg_buffercache_numa', ], }, } diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql new file mode 100644 index 00000000000..52f63aa258c --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -0,0 +1,42 @@ +/* 
contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit + +-- Register the new function. NOTE(review): DROP VIEW errors out if user objects depend on pg_buffercache; confirm the drop/recreate below is really needed. +DROP VIEW pg_buffercache; +DROP FUNCTION pg_buffercache_pages(); + +CREATE OR REPLACE FUNCTION pg_buffercache_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_pages' +LANGUAGE C PARALLEL SAFE; + +CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages() +RETURNS SETOF RECORD +AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4); + +CREATE OR REPLACE VIEW pg_buffercache_numa AS + SELECT P.* FROM pg_buffercache_numa_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, numa_zone_id int4); + +-- Don't want these to be available to public. 
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC; +REVOKE ALL ON pg_buffercache FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor; +GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor; +GRANT SELECT ON pg_buffercache TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index 5ee875f77dd..b030ba3a6fa 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.5' +default_version = '1.6' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 3ae0a018e10..b27add81f0a 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -11,12 +11,12 @@ #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" +#include "port/pg_numa.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" - #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 @@ -43,6 +43,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. */ int32 pinning_backends; + int32 numa_zone_id; } BufferCachePagesRec; @@ -61,84 +62,255 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator. 
*/ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); -Datum -pg_buffercache_pages(PG_FUNCTION_ARGS) +/* + * To get reliable results we need to "touch pages" once, see + * comments nearby pg_buffercache_numa_prepare_ptrs(). + */ +static bool firstUseInBackend = true; + +/* + * Helper routine to map Buffers into addresses that can be + * later consumed by pg_numa_query_pages() + * + * Many buffers can point to the same page (in case of + * BLCKSZ < 4kB), but we want to also query just first + * address. + * + * In order to get reliable results we also need to touch + * memory pages, so that inquiry about NUMA zone doesn't + * return -2. + */ +static inline void +pg_buffercache_numa_prepare_ptrs(int buffer_id, float pages_per_blk, Size os_page_size, + void **os_page_ptrs) +{ + size_t blk2page = (size_t) ((double) buffer_id * pages_per_blk); /* double: float cannot represent every buffer_id above 2^24 */ + + for (size_t j = 0; j < pages_per_blk; j++) + { + size_t blk2pageoff = blk2page + j; + if (os_page_ptrs[blk2pageoff] == 0) + { + volatile uint64 touch pg_attribute_unused(); + + /* NBuffers count start really from 1 */ + os_page_ptrs[blk2pageoff] = (char *) BufferGetBlock(buffer_id + 1) + (os_page_size * j); + + /* We just need to do it only once in backend */ + if (firstUseInBackend) + pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2pageoff]); + + CHECK_FOR_INTERRUPTS(); + } + } +} + +/* + * Helper routine for pg_buffercache_(numa_)pages. + * + * We need fcinfo here and we pass it here with PG_FUNCTION_ARGS + */ +static BufferCachePagesContext * +pg_buffercache_init_entries(FuncCallContext *funcctx, PG_FUNCTION_ARGS) { - FuncCallContext *funcctx; - Datum result; - MemoryContext oldcontext; BufferCachePagesContext *fctx; /* User function context. 
*/ + MemoryContext oldcontext; TupleDesc tupledesc; TupleDesc expected_tupledesc; - HeapTuple tuple; - if (SRF_IS_FIRSTCALL()) - { - int i; + /* + * Switch context when allocating stuff to be used in later calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - funcctx = SRF_FIRSTCALL_INIT(); + /* Create a user function context for cross-call persistence */ + fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); - /* Switch context when allocating stuff to be used in later calls */ - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + /* + * To smoothly support upgrades from version 1.0 of this extension + * transparently handle the (non-)existence of the pinning_backends + * column. We unfortunately have to get the result type for that... - we + * can't use the result type determined by the function definition without + * potentially crashing when somebody uses the old (or even wrong) + * function definition though. + */ + if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM || + expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM) + elog(ERROR, "incorrect number of output arguments"); + + /* Construct a tuple descriptor for the result rows. 
*/ + tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", + INT2OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", + INT8OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty", + BOOLOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", + INT2OID, -1, 0); + + if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1) + TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", + INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id", + INT4OID, -1, 0); + + fctx->tupdesc = BlessTupleDesc(tupledesc); + + /* Allocate NBuffers worth of BufferCachePagesRec records. */ + fctx->record = (BufferCachePagesRec *) + MemoryContextAllocHuge(CurrentMemoryContext, + sizeof(BufferCachePagesRec) * NBuffers); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + /* + * Return to original context when allocating transient memory + */ + MemoryContextSwitchTo(oldcontext); + return fctx; +} + +/* + * Helper routine for pg_buffercache_(numa_)pages + */ +static void +pg_buffercache_build_tuple(int i, BufferCachePagesContext *fctx) +{ + BufferDesc *bufHdr; + uint32 buf_state; + + bufHdr = GetBufferDescriptor(i); + /* Lock each buffer header before inspecting. 
*/ + buf_state = LockBufHdr(bufHdr); + + fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); + fctx->record[i].reltablespace = bufHdr->tag.spcOid; + fctx->record[i].reldatabase = bufHdr->tag.dbOid; + fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag); + fctx->record[i].blocknum = bufHdr->tag.blockNum; + fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); + fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + + if (buf_state & BM_DIRTY) + fctx->record[i].isdirty = true; + else + fctx->record[i].isdirty = false; + + /* + * Note if the buffer is valid, and has storage created + */ + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + fctx->record[i].isvalid = true; + else + fctx->record[i].isvalid = false; + + fctx->record[i].numa_zone_id = -1; + + UnlockBufHdr(bufHdr, buf_state); +} + +/* + * Helper routine for pg_buffercache_(numa_)pages + */ +static Datum +get_buffercache_tuple(int i, BufferCachePagesContext *fctx) +{ + Datum values[NUM_BUFFERCACHE_PAGES_ELEM]; + bool nulls[NUM_BUFFERCACHE_PAGES_ELEM]; + HeapTuple tuple; + + values[0] = Int32GetDatum(fctx->record[i].bufferid); + nulls[0] = false; - /* Create a user function context for cross-call persistence */ - fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); + /* + * Set all fields except the bufferid to null if the buffer is unused or + * not valid. 
+ */ + if (fctx->record[i].blocknum == InvalidBlockNumber || + fctx->record[i].isvalid == false) + { + nulls[1] = true; + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + + /* + * unused for v1.0 callers, but the array is always long enough + */ + nulls[8] = true; + nulls[9] = true; + } + else + { + values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber); + nulls[1] = false; + values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); + nulls[2] = false; + values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); + nulls[3] = false; + values[4] = ObjectIdGetDatum(fctx->record[i].forknum); + nulls[4] = false; + values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); + nulls[5] = false; + values[6] = BoolGetDatum(fctx->record[i].isdirty); + nulls[6] = false; + values[7] = Int16GetDatum(fctx->record[i].usagecount); + nulls[7] = false; /* - * To smoothly support upgrades from version 1.0 of this extension - * transparently handle the (non-)existence of the pinning_backends - * column. We unfortunately have to get the result type for that... - - * we can't use the result type determined by the function definition - * without potentially crashing when somebody uses the old (or even - * wrong) function definition though. + * unused for v1.0 callers, but the array is always long enough */ - if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); + values[8] = Int32GetDatum(fctx->record[i].pinning_backends); + nulls[8] = false; + values[9] = Int32GetDatum(fctx->record[i].numa_zone_id); + nulls[9] = false; + } - if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM || - expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM) - elog(ERROR, "incorrect number of output arguments"); + /* Build and return the tuple. 
*/ + tuple = heap_form_tuple(fctx->tupdesc, values, nulls); + return HeapTupleGetDatum(tuple); +} - /* Construct a tuple descriptor for the result rows. */ - tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts); - TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", - INT4OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", - OIDOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", - OIDOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", - OIDOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber", - INT2OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber", - INT8OID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty", - BOOLOID, -1, 0); - TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", - INT2OID, -1, 0); - - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) - TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", - INT4OID, -1, 0); - - fctx->tupdesc = BlessTupleDesc(tupledesc); - - /* Allocate NBuffers worth of BufferCachePagesRec records. */ - fctx->record = (BufferCachePagesRec *) - MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; - funcctx->user_fctx = fctx; - - /* Return to original context when allocating transient memory */ - MemoryContextSwitchTo(oldcontext); +/* + * When updating this routine please sync it with below one: + * pg_buffercache_numa_pages() + */ +Datum +pg_buffercache_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + BufferCachePagesContext *fctx; /* User function context. 
+ + if (SRF_IS_FIRSTCALL()) + { + int i; + + funcctx = SRF_FIRSTCALL_INIT(); + fctx = pg_buffercache_init_entries(funcctx, fcinfo); /* * Scan through all the buffers, saving the relevant fields in the @@ -149,36 +321,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * locks, so the information of each buffer is self-consistent. */ for (i = 0; i < NBuffers; i++) - { - BufferDesc *bufHdr; - uint32 buf_state; - - bufHdr = GetBufferDescriptor(i); - /* Lock each buffer header before inspecting. */ - buf_state = LockBufHdr(bufHdr); - - fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); - fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); - fctx->record[i].reltablespace = bufHdr->tag.spcOid; - fctx->record[i].reldatabase = bufHdr->tag.dbOid; - fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag); - fctx->record[i].blocknum = bufHdr->tag.blockNum; - fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); - fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); - - if (buf_state & BM_DIRTY) - fctx->record[i].isdirty = true; - else - fctx->record[i].isdirty = false; - - /* Note if the buffer is valid, and has storage created */ - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - fctx->record[i].isvalid = true; - else - fctx->record[i].isvalid = false; - - UnlockBufHdr(bufHdr, buf_state); - } + pg_buffercache_build_tuple(i, fctx); } funcctx = SRF_PERCALL_SETUP(); @@ -188,59 +331,117 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) if (funcctx->call_cntr < funcctx->max_calls) { + Datum result; uint32 i = funcctx->call_cntr; - Datum values[NUM_BUFFERCACHE_PAGES_ELEM]; - bool nulls[NUM_BUFFERCACHE_PAGES_ELEM]; - values[0] = Int32GetDatum(fctx->record[i].bufferid); - nulls[0] = false; + result = get_buffercache_tuple(i, fctx); + SRF_RETURN_NEXT(funcctx, result); + } + else + { + SRF_RETURN_DONE(funcctx); + } +} + +/* + * This is almost identical to the above, but performs + * NUMA inquiry about memory mappings + */ +Datum 
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + BufferCachePagesContext *fctx; /* User function context. */ + + if (SRF_IS_FIRSTCALL()) + { + int i; + Size os_page_size = 0; + void **os_page_ptrs = NULL; + int *os_pages_status = NULL; + int os_page_count = 0; + float pages_per_blk = 0; + + funcctx = SRF_FIRSTCALL_INIT(); + if (pg_numa_init() == -1) + elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable."); + fctx = pg_buffercache_init_entries(funcctx, fcinfo); /* - * Set all fields except the bufferid to null if the buffer is unused - * or not valid. + * This is for gathering some NUMA statistics. We might be using + * various DB block sizes (4kB, 8kB , .. 32kB) that end up being + * allocated in various different OS memory pages sizes, so first + * we need to understand the OS memory page size before calling + * move_pages() + */ + os_page_size = pg_numa_get_pagesize(); + os_page_count = ((uint64) NBuffers * BLCKSZ) / os_page_size; + pages_per_blk = (float) BLCKSZ / os_page_size; + + elog(DEBUG1, "NUMA: os_page_count=%d os_page_size=%zu pages_per_blk=%f", + os_page_count, os_page_size, pages_per_blk); + + os_page_ptrs = palloc(sizeof(void *) * os_page_count); + os_pages_status = palloc(sizeof(int) * os_page_count); + memset(os_page_ptrs, 0, sizeof(void *) * os_page_count); + + /* + * If we ever get 0xff back from kernel inquiry, then we probably + * have bug in our buffers to OS page mapping code here + */ + memset(os_pages_status, 0xff, sizeof(int) * os_page_count); + + if (firstUseInBackend) + elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); + + /* + * Scan through all the buffers, saving the relevant fields in the + * fctx->record structure. 
+ * + * We don't hold the partition locks, so we don't get a consistent + * snapshot across all buffers, but we do grab the buffer header + * locks, so the information of each buffer is self-consistent. */ - if (fctx->record[i].blocknum == InvalidBlockNumber || - fctx->record[i].isvalid == false) + for (i = 0; i < NBuffers; i++) { - nulls[1] = true; - nulls[2] = true; - nulls[3] = true; - nulls[4] = true; - nulls[5] = true; - nulls[6] = true; - nulls[7] = true; - /* unused for v1.0 callers, but the array is always long enough */ - nulls[8] = true; + pg_buffercache_build_tuple(i, fctx); + pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size, os_page_ptrs); } - else + + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + + for (i = 0; i < NBuffers; i++) { - values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber); - nulls[1] = false; - values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace); - nulls[2] = false; - values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase); - nulls[3] = false; - values[4] = ObjectIdGetDatum(fctx->record[i].forknum); - nulls[4] = false; - values[5] = Int64GetDatum((int64) fctx->record[i].blocknum); - nulls[5] = false; - values[6] = BoolGetDatum(fctx->record[i].isdirty); - nulls[6] = false; - values[7] = Int16GetDatum(fctx->record[i].usagecount); - nulls[7] = false; - /* unused for v1.0 callers, but the array is always long enough */ - values[8] = Int32GetDatum(fctx->record[i].pinning_backends); - nulls[8] = false; + int blk2page = (int) i * pages_per_blk; + + /* + * Technically we can get errors too here and pass that to + * user. Also we could somehow report single DB block spanning + * more than one NUMA zone, but it should be rare. + */ + fctx->record[i].numa_zone_id = os_pages_status[blk2page]; } + } + + funcctx = SRF_PERCALL_SETUP(); - /* Build and return the tuple. 
*/ - tuple = heap_form_tuple(fctx->tupdesc, values, nulls); - result = HeapTupleGetDatum(tuple); + /* Get the saved state */ + fctx = funcctx->user_fctx; + if (funcctx->call_cntr < funcctx->max_calls) + { + Datum result; + uint32 i = funcctx->call_cntr; + + result = get_buffercache_tuple(i, fctx); SRF_RETURN_NEXT(funcctx, result); } else + { + firstUseInBackend = false; SRF_RETURN_DONE(funcctx); + } } Datum diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql new file mode 100644 index 00000000000..2225b879f58 --- /dev/null +++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql @@ -0,0 +1,20 @@ +SELECT NOT(pg_numa_available()) AS skip_test \gset +\if :skip_test +\quit +\endif + +select count(*) = (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_numa; + +-- Check that the functions / views can't be accessed by default. To avoid +-- having to create a dedicated user, use the pg_database_owner pseudo-role. +SET ROLE pg_database_owner; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; + +-- Check that pg_monitor is allowed to query view / function +SET ROLE pg_monitor; +SELECT count(*) > 0 FROM pg_buffercache_numa; +RESET role; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 802a5112d77..75978a6eaed 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -30,7 +30,9 @@ This module provides the pg_buffercache_pages() function (wrapped in the pg_buffercache view), - the pg_buffercache_summary() function, the + pg_buffercache_numa_pages() function (wrapped in the + pg_buffercache_numa view), the + pg_buffercache_summary() function, the pg_buffercache_usage_counts() function and the pg_buffercache_evict() function. @@ -42,6 +44,13 @@ convenient use. 
+ + The similar pg_buffercache_numa_pages() function is a slower + variant of the above that also provides the NUMA node ID for each shared buffer entry. + The pg_buffercache_numa view wraps the function for + convenient use. + + The pg_buffercache_summary() function returns a single row summarizing the state of the shared buffer cache. @@ -200,6 +209,59 @@ + + The <structname>pg_buffercache_numa</structname> View + + + The definitions of the columns exposed are almost identical to the previous + pg_buffercache view, but this one includes one additional + column numa_zone_id as defined in . + + + + <structname>pg_buffercache_numa</structname> Columns + + + + + Column Type + + + Description + + + + + + + + numa_zone_id integer + + + ID (number) of the NUMA node for this particular buffer. NULL if the shared buffer + has not been used yet. On systems without NUMA this usually returns 0. + + + + + +
+ + + This is a clone of the original pg_buffercache view that additionally provides + the numa_zone_id column. Fetching this + information from the OS is costly and might take much longer, so querying it from + automated or monitoring systems is not recommended. + + + + As the NUMA node ID inquiry for each page requires memory pages to be paged in, the first + execution of this function can take a long time, especially on systems with large + shared_buffers and without huge_pages enabled. + + +
+ The <function>pg_buffercache_summary()</function> Function -- 2.39.5