From 8ea0d82a1c72f1fcbf834cfa5a7913fce0778ac8 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Fri, 16 Jan 2026 21:55:02 +0100 Subject: [PATCH] Handle ENOENT status when querying NUMA node We've assumed that touching the memory is sufficient for a page to be located on one of the NUMA nodes. But that's not quite true, because a page may be moved to swap after we touch it. It's not hard to make that happen with commands like CREATE INDEX (which uses only a small circular buffer in shared buffers, while loading large amounts of data into page cache). This memory pressure may force a significant fraction of shared buffers to swap. We touch the memory before querying the status, but there is no guarantee it won't be moved to swap in between. We do the touching only during the first call, so later calls are more likely to be affected. This only happens with regular memory pages (e.g. 4K). Hugepages cannot be swapped out under memory pressure. We can't prevent this - it's up to the kernel to move pages to swap. Therefore, we have to accept ENOENT (-2) status as a valid result, and handle it without failing. This patch simply treats -2 as unknown node, and returns NULL in the two affected views (pg_shmem_allocations_numa and pg_buffercache_numa). Reported by Christoph Berg, investigation and fix by me. Backpatch to 18, where the two views were introduced. 
Reported-by: Christoph Berg Discussion: https://postgr.es/m/aTq5Gt_n-oS_QSpL@msg.df7cb.de Backpatch-through: 18 --- contrib/pg_buffercache/pg_buffercache_pages.c | 12 +++++-- src/backend/storage/ipc/shmem.c | 32 +++++++++++++++---- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index dcba3fb5473..9ff0eb4b0a0 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -551,8 +551,16 @@ pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa) if (fctx->include_numa) { - values[2] = Int32GetDatum(fctx->record[i].numa_node); - nulls[2] = false; + /* status is valid node number */ + if (fctx->record[i].numa_node >= 0) + { + values[2] = Int32GetDatum(fctx->record[i].numa_node); + nulls[2] = false; + } else { + /* some kind of error (e.g. pages moved to swap) */ + values[2] = (Datum) 0; + nulls[2] = true; + } } else { diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index d2f4710f141..1b536363152 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -599,7 +599,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) InitMaterializedSRF(fcinfo, 0); max_nodes = pg_numa_get_max_node(); - nodes = palloc_array(Size, max_nodes + 1); + nodes = palloc_array(Size, max_nodes + 2); /* * Shared memory allocations can vary in size and may not align with OS @@ -635,7 +635,6 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) hash_seq_init(&hstat, ShmemIndex); /* output all allocated entries */ - memset(nulls, 0, sizeof(nulls)); while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) { int i; @@ -684,22 +683,33 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) elog(ERROR, "failed NUMA pages inquiry status: %m"); /* Count number of NUMA nodes used for this shared memory entry */ - memset(nodes, 0, sizeof(Size) * (max_nodes + 1)); + 
memset(nodes, 0, sizeof(Size) * (max_nodes + 2)); for (i = 0; i < shm_ent_page_count; i++) { int s = pages_status[i]; /* Ensure we are adding only valid index to the array */ - if (s < 0 || s > max_nodes) + if (s >= 0 && s <= max_nodes) + { + /* valid NUMA node */ + nodes[s]++; + continue; + } + else if (s == -2) { - elog(ERROR, "invalid NUMA node id outside of allowed range " - "[0, " UINT64_FORMAT "]: %d", max_nodes, s); + /* -2 means ENOENT (e.g. page was moved to swap) */ + nodes[max_nodes + 1]++; + continue; } - nodes[s]++; + elog(ERROR, "invalid NUMA node id outside of allowed range " + "[0, " UINT64_FORMAT "]: %d", max_nodes, s); } + /* no NULLs for regular nodes */ + memset(nulls, 0, sizeof(nulls)); + /* * Add one entry for each NUMA node, including those without allocated * memory for this segment. @@ -713,6 +723,14 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } + + /* The last entry is used for pages without a NUMA node. */ + nulls[1] = true; + values[0] = CStringGetTextDatum(ent->key); + values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); } LWLockRelease(ShmemIndexLock); -- 2.52.0