From e7d2c281c12425012882263857f76d0d395f3abc Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 7 Apr 2025 16:43:32 +0200 Subject: [PATCH v26 6/7] reworks --- contrib/pg_buffercache/pg_buffercache_pages.c | 169 +++++++++--------- 1 file changed, 83 insertions(+), 86 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index fe2ffadcb3a..03fc6574a52 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -306,64 +306,85 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { int i, - j, idx; Size os_page_size = 0; void **os_page_ptrs = NULL; int *os_page_status; uint64 os_page_count; int pages_per_buffer; - int buffers_per_page; + int max_entries; volatile uint64 touch pg_attribute_unused(); - char *startptr = NULL; + char *startptr, + *endptr; if (pg_numa_init() == -1) elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); /* - * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, - * while the OS may have different memory page sizes. + * The database block size and OS memory page size are unlikely to be + * the same. The block size is 1-32KB, the memory page size depends on + * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but + * there are also features like THP etc. Moreover, we don't quite know + * how the pages and buffers "align" in memory - the buffers may be + * shifted in some way, using more memory pages than necessary. * - * To correctly map between them, we need to: 1. Determine the OS - * memory page size 2. Calculate how many OS pages are used by all - * buffer blocks 3. Calculate how many OS pages are contained within - * each database block. + * So we need to be careful about mapping buffers to memory pages. We + * calculate the maximum number of pages a buffer might use, so that + * we allocate enough space for the entries. 
And then we count the + * actual number of entries as we scan the buffers. * * This information is needed before calling move_pages() for NUMA * node id inquiry. */ os_page_size = pg_numa_get_pagesize(); - buffers_per_page = os_page_size / BLCKSZ; - pages_per_buffer = BLCKSZ / os_page_size; /* * The pages and block size is expected to be 2^k, so one divides the - * other (we don't know in which direction). + * other (we don't know in which direction). This does not say + * anything about relative alignment of pages/buffers. */ Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0)); /* - * Either both counts are 1 (when the pages have the same size), or - * exacly one of them is zero. Both can't be zero at the same time. + * How many addresses we are going to query? Simply get the page for + * the first buffer, and first page after the last buffer, and count + * the pages from that. */ - Assert((buffers_per_page > 0) || (pages_per_buffer > 0)); - Assert(((buffers_per_page == 1) && (pages_per_buffer == 1)) || - ((buffers_per_page == 0) || (pages_per_buffer == 0))); + startptr = (char *) TYPEALIGN_DOWN(os_page_size, + BufferGetBlock(1)); + endptr = (char *) TYPEALIGN_DOWN(os_page_size, + (char *) BufferGetBlock(NBuffers) + BLCKSZ); + os_page_count = (endptr - startptr) / os_page_size; + + /* Used to determine the NUMA node for all OS pages at once */ + os_page_ptrs = palloc0(sizeof(void *) * os_page_count); + os_page_status = palloc(sizeof(uint64) * os_page_count); + + /* Fill pointers for all the memory pages. 
*/ + idx = 0; + for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) + { + os_page_ptrs[idx++] = ptr; + + /* Only need to touch memory once per backend process lifetime */ + if (firstNumaTouch) + pg_numa_touch_mem_if_required(touch, ptr); + } + + Assert(idx == os_page_count); + + elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " + "os_page_size=%zu", NBuffers, os_page_count, os_page_size); /* - * How many addresses we are going to query (store) depends on the - * relation between BLCKSZ : PAGESIZE. We need at least one status per - * buffer - if the memory page is larger than buffer, we still query - * it for each buffer. With multiple memory pages per buffer, we need - * that many entries. + * If we ever get 0xff back from kernel inquiry, then we probably have + * bug in our buffers to OS page mapping code here. */ - os_page_count = NBuffers * Max(1, pages_per_buffer); - - elog(DEBUG1, "NUMA: NBuffers=%d os_page_query_count=" UINT64_FORMAT " " - "os_page_size=%zu buffers_per_page=%d pages_per_buffer=%d", - NBuffers, os_page_count, os_page_size, - buffers_per_page, pages_per_buffer); + memset(os_page_status, 0xff, sizeof(int) * os_page_count); + /* Query NUMA status for all the pointers */ + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); /* Initialize the multi-call context, load entries about buffers */ @@ -392,29 +413,24 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) fctx->tupdesc = BlessTupleDesc(tupledesc); - /* Allocate NBuffers worth of BufferCachePagesRec records. */ + /* + * Each buffer needs at least one entry, but it might be offset in + * some way, and use one extra entry. So we allocate space for the + * maximum number of entries we might need, and then count the exact + * number as we're walking buffers. That way we can do it in one pass, + * without reallocating memory. 
+ */ + pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; + max_entries = NBuffers * pages_per_buffer; + + /* Allocate entries for BufferCachePagesRec records. */ fctx->record = (BufferCacheNumaRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCacheNumaRec) * os_page_count); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; - funcctx->user_fctx = fctx; + sizeof(BufferCacheNumaRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); - - /* Used to determine the NUMA node for all OS pages at once */ - os_page_ptrs = palloc0(sizeof(void *) * os_page_count); - os_page_status = palloc(sizeof(uint64) * os_page_count); - - /* - * If we ever get 0xff back from kernel inquiry, then we probably have - * bug in our buffers to OS page mapping code here. - */ - memset(os_page_status, 0xff, sizeof(int) * os_page_count); - if (firstNumaTouch) elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); @@ -434,9 +450,13 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) idx = 0; for (i = 0; i < NBuffers; i++) { + char *buffptr = (char *) BufferGetBlock(i + 1); BufferDesc *bufHdr; uint32 buf_state; uint32 bufferid; + int32 ospageid; + char *startptr_buff, + *endptr_buff; CHECK_FOR_INTERRUPTS(); @@ -445,58 +465,35 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); bufferid = BufferDescriptorGetBuffer(bufHdr); - UnlockBufHdr(bufHdr, buf_state); - /* - * If we have multiple OS pages per buffer, fill those in too. We - * always want at least one OS page, even if there are multiple - * buffers per page. - * - * Altough we could query just once per each OS page, we do it - * repeatably for each Buffer and hit the same address as - * move_pages(2) requires page aligment. This also simplifies - * retrieval code later on. Also NBuffers starts from 1. 
- */ - for (j = 0; j < Max(1, pages_per_buffer); j++) - { - char *buffptr = (char *) BufferGetBlock(i + 1); - - fctx->record[idx].bufferid = bufferid; + /* start of the first page of this buffer */ + startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); - os_page_ptrs[idx] - = (char *) TYPEALIGN_DOWN(os_page_size, - buffptr + (os_page_size * j)); + /* start of the page right after this buffer */ + endptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr + BLCKSZ); - /* calculate ID of the OS memory page */ - fctx->record[idx].numa_page - = ((char *) os_page_ptrs[idx] - startptr) / os_page_size; + /* calculate ID of the first page for this buffer */ + ospageid = (startptr_buff - startptr) / os_page_size; - /* Only need to touch memory once per backend process lifetime */ - if (firstNumaTouch) - pg_numa_touch_mem_if_required(touch, - buffptr + (os_page_size * j)); + /* Add an entry for each OS page overlapping with this buffer. */ + for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) + { + fctx->record[idx].bufferid = bufferid; + fctx->record[idx].numa_page = ospageid; + fctx->record[idx].numa_node = os_page_status[ospageid]; + /* advance to the next entry/page */ ++idx; + ++ospageid; } - } - /* We should get exactly the expected number of entrires */ - Assert(idx == os_page_count); - - /* Query NUMA status for all the pointers */ - if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) - elog(ERROR, "failed NUMA pages inquiry: %m"); + Assert((idx >= os_page_count) && (idx <= max_entries)); - /* - * Update the entries with NUMA node ID. The status array is indexed - * the same way as the entry index. - */ - for (i = 0; i < os_page_count; i++) - { - fctx->record[i].numa_node = os_page_status[i]; - } + /* Set max calls and remember the user function context. */ + funcctx->max_calls = idx; + funcctx->user_fctx = fctx; /* Remember this backend touched the pages */ firstNumaTouch = false; -- 2.49.0