From 63dd4ccbb0ae0de2eefb72ae5a4a7bf7b5a6455b Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Thu, 19 Jun 2025 17:38:29 +0200 Subject: [PATCH 14/19] Support shrinking shared buffers Buffer eviction =============== When shrinking the shared buffers pool, each buffer in the area being shrunk needs to be flushed if it's dirty so as not to lose the changes to that buffer after shrinking. Also, each such buffer needs to be removed from the buffer mapping table so that backends do not access it after shrinking. Buffer eviction requires a separate barrier phase for two reasons: 1. No other backend should map a new page to any of the buffers being evicted when eviction is in progress. So they wait while eviction is in progress. 2. Since a pinned buffer has the pin recorded in the backend local memory as well as the buffer descriptor (which is in shared memory), eviction should not coincide with remapping the shared memory of a backend. Otherwise we might lose consistency of local and shared pinning records. Hence it needs to be carried out in ProcessBarrierShmemResize() and not in AnonymousShmemResize() as indicated by the now-removed comment. If a buffer being evicted is pinned, we raise a FATAL error but this should improve. There are multiple options 1. to wait for the pinned buffer to get unpinned, 2. the backend is killed or it itself cancels the query or 3. roll back the operation. Note that options 1 and 2 would require the pinning related local and shared records to be accessed. But the infrastructure to do either of these is not available right now. Removing the evicted buffers from buffer ring ============================================= If the buffer pool has been shrunk, the buffers in the buffer ring may not be valid anymore. Modify GetBufferFromRing to check if the buffer is still valid before using it. This makes GetBufferFromRing() a bit more expensive because of an additional boolean condition and masks any bug that introduces an invalid buffer into the ring. 
The alternative fix is more complex as explained below. The strategy object is created in CurrentMemoryContext and is not available in any global structure, and thus is not accessible when processing buffer resizing barriers. We may modify GetAccessStrategy() to register the strategy in a global linked list and then arrange to deregister it once it's no longer in use. Looking at the places which use GetAccessStrategy(), fixing all those may be some work. Author: Ashutosh Bapat Reviewed-by: Tomas Vondra --- src/backend/port/sysv_shmem.c | 42 ++++++--- src/backend/storage/buffer/bufmgr.c | 93 +++++++++++++++++++ src/backend/storage/buffer/freelist.c | 18 +++- .../utils/activity/wait_event_names.txt | 1 + src/include/storage/bufmgr.h | 1 + src/include/storage/pg_shmem.h | 1 + 6 files changed, 139 insertions(+), 17 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 54d335b2e5d..9e1b2c3201f 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -993,14 +993,6 @@ AnonymousShmemResize(void) */ pending_pm_shmem_resize = false; - /* - * XXX: Currently only increasing of shared_buffers is supported. For - * decreasing something similar has to be done, but buffer blocks with - * data have to be drained first. - */ - if(NBuffersOld > NBuffers) - return false; - #ifndef MAP_HUGETLB /* PrepareHugePages should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); @@ -1099,11 +1091,14 @@ AnonymousShmemResize(void) * all the pointers are still valid, and we only need to update * structures size in the ShmemIndex once -- any other backend * will pick up this shared structure from the index. - * - * XXX: This is the right place for buffer eviction as well. */ BufferManagerShmemInit(NBuffersOld); + /* + * Wipe out the evictor PID so that it can be used for the next + * buffer resizing operation. 
+ */ + ShmemCtrl->evictor_pid = 0; /* If all fine, broadcast the new value */ pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers); } @@ -1156,11 +1151,31 @@ ProcessBarrierShmemResize(Barrier *barrier) * XXX: If we need to be able to abort resizing, this has to be done later, * after the SHMEM_RESIZE_DONE. */ - if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START)) + + /* + * Evict extra buffers when shrinking shared buffers. We need to do this + * while the memory for extra buffers is still mapped, i.e., before remapping + * the shared memory segments to a smaller memory area. + */ + if (NBuffersOld > NBuffersPending) { - Assert(IsUnderPostmaster); - SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); + BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START); + + /* + * TODO: If the buffer eviction fails for any reason, we should + * gracefully roll back the shared buffer resizing and try again. But the + * infrastructure to do so is not available right now. Hence just raise + * a FATAL so that the system restarts. 
+ */ + if (!EvictExtraBuffers(NBuffersPending, NBuffersOld)) + elog(FATAL, "buffer eviction failed"); + + if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_EVICT)) + SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); } + else + if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START)) + SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); AnonymousShmemResize(); @@ -1684,5 +1699,6 @@ ShmemControlInit(void) /* shmem_resizable should be initialized by now */ ShmemCtrl->Resizable = shmem_resizable; + ShmemCtrl->evictor_pid = 0; } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index edf17ce3ea1..467d9880f7b 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,6 +57,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -7457,3 +7458,95 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +/* + * When shrinking the shared buffers pool, evict the buffers which will not be part + * of the shrunk buffer pool. + */ +bool +EvictExtraBuffers(int newBufSize, int oldBufSize) +{ + bool result = true; + + /* + * If the buffer being evicted is locked, this function will need to wait. + * This function should not be called from a Postmaster since it cannot wait on a lock. + */ + Assert(IsUnderPostmaster); + + /* + * Let only one backend perform eviction. We could split the work across all + * the backends but that doesn't seem necessary. + * + * The first backend to acquire ShmemResizeLock sets its own PID as the + * evictor PID for other backends to know that the eviction is in progress or + * has already been performed. The evictor backend releases the lock when it + * finishes eviction. 
While the eviction is in progress, backends other than the + * evictor backend won't be able to take the lock. They won't perform + * eviction. A backend may acquire the lock after eviction has completed, but + * it will not perform eviction since the evictor PID is already set. The evictor + * PID is reset only when the buffer resizing finishes. Thus only one backend + * will perform eviction in a given instance of shared buffers resizing. + * + * Any backend which acquires this lock will release it before the eviction + * phase finishes, hence the same lock can be reused for the next phase of + * resizing buffers. + */ + if (LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE)) + { + if (ShmemCtrl->evictor_pid == 0) + { + ShmemCtrl->evictor_pid = MyProcPid; + + /* + * TODO: Before evicting any buffer, we should check whether any of the + * buffers are pinned. If we find that a buffer is pinned after evicting + * most of them, that will impact performance since all those evicted + * buffers might need to be read again. + */ + for (Buffer buf = newBufSize + 1; buf <= oldBufSize; buf++) + { + BufferDesc *desc = GetBufferDescriptor(buf - 1); + uint32 buf_state; + bool buffer_flushed; + + buf_state = pg_atomic_read_u32(&desc->state); + + /* + * Nobody is expected to touch the buffers while resizing is + * going on, hence an unlocked precheck should be safe and save + * some cycles. + */ + if (!(buf_state & BM_VALID)) + continue; + + /* + * XXX: Looks like CurrentResourceOwner can be NULL here, find + * another one in that case? + */ + if (CurrentResourceOwner) + ResourceOwnerEnlarge(CurrentResourceOwner); + + ReservePrivateRefCountEntry(); + + LockBufHdr(desc); + + /* + * Now that we have locked the buffer descriptor, make sure that the + * buffer without valid data has been skipped above. 
+ */ + Assert(buf_state & BM_VALID); + + if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) + { + elog(WARNING, "could not remove buffer %u, it is pinned", buf); + result = false; + break; + } + } + } + LWLockRelease(ShmemResizeLock); + } + + return result; +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 299f6aa8e7e..0da8fbb580e 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -669,12 +669,22 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) strategy->current = 0; /* - * If the slot hasn't been filled yet, tell the caller to allocate a new - * buffer with the normal allocation strategy. He will then fill this - * slot by calling AddBufferToRing with the new buffer. + * If the slot hasn't been filled yet or the buffer in the slot has been + * invalidated when the buffer pool was shrunk, tell the caller to allocate a new + * buffer with the normal allocation strategy. He will then fill this slot + * by calling AddBufferToRing with the new buffer. + * + * TODO: Ideally we would want to check for bufnum > NBuffers only once + * after every time the buffer pool is shrunk so as to catch any runtime + * bugs that introduce invalid buffers in the ring. But that is complicated. + * The BufferAccessStrategy objects are not accessible outside the + * ScanState. Hence we cannot purge the buffers while evicting the buffers. + * After the resizing is finished, it's not possible to notice when we touch + * the first of those objects and the last of them. See if this can + * be fixed. 
*/ bufnum = strategy->buffers[strategy->current]; - if (bufnum == InvalidBuffer) + if (bufnum == InvalidBuffer || bufnum > NBuffers) return NULL; buf = GetBufferDescriptor(bufnum - 1); diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 82cee6b8877..9a6a6275305 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -156,6 +156,7 @@ REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it c RESTORE_COMMAND "Waiting for to complete." SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a READ ONLY DEFERRABLE transaction." SHMEM_RESIZE_START "Waiting for other backends to start resizing shared memory." +SHMEM_RESIZE_EVICT "Waiting for other backends to finish buffer eviction phase." SHMEM_RESIZE_DONE "Waiting for other backends to finish resizing shared memory." SYNC_REP "Waiting for confirmation from a remote server during synchronous replication." WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index e2e97866b40..e7c973adca8 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -316,6 +316,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped); +extern bool EvictExtraBuffers(int newBufSize, int oldBufSize); /* in buf_init.c */ extern void BufferManagerShmemInit(int); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index eba28ce8a5c..0a59746b472 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -77,6 +77,7 @@ extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS]; typedef struct { pg_atomic_uint32 NSharedBuffers; + pid_t evictor_pid; Barrier Barrier; pg_atomic_uint64 Generation; bool Resizable; -- 2.34.1