From 9f804f7a003d00771304af6f0f4f96a9839571f3 Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Fri, 26 Sep 2025 19:12:45 +0530 Subject: [PATCH 18/19] Re-implement UI and synchronization for resizing buffer pool shared_buffers is now PGC_SIGHUP instead of PGC_POSTMASTER. The value of this GUC is saved in NBuffersPending instead of NBuffers which now shows the size of buffer pool in effect. When the server starts, the shared memory size is estimated and the memory is allocated using NBuffersPending followed by setting NBuffers = NBuffersPending. When a server is running, the new value of GUC (set using ALTER SYSTEM ... SET shared_buffers = ...; followed by SELECT pg_reload_conf()) does not come into effect immediately. Instead a function pg_resize_shared_buffers() is used to resize the buffer pool. The function uses the current value of GUC in the backends where it is executed. The function also coordinates the buffer resizing synchronization across backends. SHOW shared_buffers now shows the current size of the shared buffers but it also shows the pending size of shared buffers, if any. A new GUC max_shared_buffers is introduced to control the maximum value of shared_buffers that can be set. By default it is 0 and it is set to shared_buffers' value. When explicitly set it needs to be higher than 'shared_buffers'. This GUC determines the size of address space reserved for future buffer pool sizes and the size of the buffer lookup table. TODO: In case the backend executing pg_resize_shared_buffers() exits before the operation finishes, we will need somebody to clean up or complete the half-finished resizing operation. Best possibility is to use a background worker (mostly background writer) to do that. But then I think making that background worker the coordinator itself might be a better option since it will be restarted by the postmaster upon premature exit. 
Author: Ashutosh Bapat --- doc/src/sgml/config.sgml | 44 +- doc/src/sgml/func/func-admin.sgml | 57 ++ src/backend/access/transam/slru.c | 2 +- src/backend/access/transam/xlog.c | 2 +- src/backend/bootstrap/bootstrap.c | 2 + src/backend/port/sysv_shmem.c | 433 ++-------------- src/backend/postmaster/postmaster.c | 40 +- src/backend/storage/buffer/buf_init.c | 264 ++++++++-- src/backend/storage/buffer/bufmgr.c | 126 +++-- src/backend/storage/buffer/freelist.c | 118 ++--- src/backend/storage/ipc/ipci.c | 8 +- src/backend/storage/ipc/procsignal.c | 14 +- src/backend/storage/ipc/shmem.c | 485 +++++++++++++++++- src/backend/tcop/postgres.c | 13 +- .../utils/activity/wait_event_names.txt | 4 +- src/backend/utils/init/globals.c | 6 +- src/backend/utils/init/postinit.c | 32 ++ src/backend/utils/misc/guc.c | 2 +- src/backend/utils/misc/guc_parameters.dat | 16 +- src/include/catalog/pg_proc.dat | 6 + src/include/miscadmin.h | 6 +- src/include/storage/buf_internals.h | 2 +- src/include/storage/bufmgr.h | 17 +- src/include/storage/ipc.h | 1 - src/include/storage/pg_shmem.h | 24 +- src/include/storage/procsignal.h | 5 +- src/include/storage/shmem.h | 8 + src/include/utils/guc.h | 2 + src/test/buffermgr/Makefile | 3 + src/test/buffermgr/buffermgr_test.conf | 9 + src/test/buffermgr/expected/buffer_resize.out | 184 +++++-- src/test/buffermgr/meson.build | 5 + src/test/buffermgr/sql/buffer_resize.sql | 44 +- src/test/buffermgr/t/001_resize_buffer.pl | 44 +- .../buffermgr/t/003_parallel_resize_buffer.pl | 71 +++ 35 files changed, 1387 insertions(+), 712 deletions(-) create mode 100644 src/test/buffermgr/buffermgr_test.conf create mode 100644 src/test/buffermgr/t/003_parallel_resize_buffer.pl diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 39e658b7808..732f9636857 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1724,7 +1724,6 @@ include_dir 'conf.d' that is BLCKSZ bytes, typically 8kB. 
(Non-default values of BLCKSZ change the minimum value.) - This parameter can only be set at server start. @@ -1747,6 +1746,49 @@ include_dir 'conf.d' appropriate, so as to leave adequate space for the operating system. + + The shared memory consumed by the buffer pool is allocated and + initialized according to the value of the GUC at the time of starting + the server. A desired new value of GUC can be loaded while the server is + running using SIGHUP. But the buffer pool will + not be resized immediately. Use + pg_resize_shared_buffers() to dynamically resize + the shared buffer pool (see for details). + SHOW shared_buffers shows the current number of + shared buffers and pending number, if any. Please note that when the GUC + is changed, the other GUCs which use this GUC's value to set their + defaults will not be changed. They may still require a server restart to + consider the new value. + + + + + + max_shared_buffers (integer) + + max_shared_buffers configuration parameter + + + + + Sets the upper limit for the shared_buffers value. + The default value is 0, + which means no explicit limit is set and max_shared_buffers + will be automatically set to the value of shared_buffers + at server startup. + If this value is specified without units, it is taken as blocks, + that is BLCKSZ bytes, typically 8kB. + This parameter can only be set at server start. + + + + This parameter determines the amount of memory address space to reserve + in each backend for expanding the buffer pool in future. While the + memory for buffer pool is allocated on demand as it is resized, the + memory required to hold the buffer manager metadata is allocated + statically at the server start accounting for the largest buffer pool + size allowed by this parameter. 
+ diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 1b465bc8ba7..0dc89b07c76 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -99,6 +99,63 @@ off + + + + + pg_resize_shared_buffers + + pg_resize_shared_buffers () + boolean + + + Dynamically resizes the shared buffer pool to match the current + value of the shared_buffers parameter. This + function implements a coordinated resize process that ensures all + backend processes acknowledge the change before completing the + operation. The resize happens in multiple phases to maintain + data consistency and system stability. Returns true + if the resize was successful, or raises an error if the operation + fails. This function can only be called by superusers. + + + To resize shared buffers, first update the shared_buffers + setting and reload the configuration, then verify the new value is loaded + before calling this function. For example: + +postgres=# ALTER SYSTEM SET shared_buffers = '256MB'; +ALTER SYSTEM +postgres=# SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +------------------------- + 128MB (pending: 256MB) +(1 row) + +postgres=# SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +---------------- + 256MB +(1 row) + + The SHOW shared_buffers step is important to verify + that the configuration reload was successful and the new value is + available to the current session before attempting the resize. The + output shows both the current and pending values when a change is waiting + to be applied. 
+ + diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 5d3fcd62c94..3eae1d0c7e9 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -232,7 +232,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) { return Min(max - (max % SLRU_BANK_SIZE), Max(SLRU_BANK_SIZE, - NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE)); + NBuffersPending / divisor - (NBuffersPending / divisor) % SLRU_BANK_SIZE)); } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index eceab341255..ea01befe15c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4662,7 +4662,7 @@ XLOGChooseNumBuffers(void) { int xbuffers; - xbuffers = NBuffers / 32; + xbuffers = NBuffersPending / 32; if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) xbuffers = (wal_segment_size / XLOG_BLCKSZ); if (xbuffers < 8) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index fc8638c1b61..226944e4588 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -335,6 +335,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) InitializeFastPathLocks(); + InitializeMaxNBuffers(); + CreateSharedMemoryAndSemaphores(); /* diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 3be28e228ae..380ecbc9751 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -102,14 +102,8 @@ void *UsedShmemSegAddr = NULL; AnonymousMapping Mappings[ANON_MAPPINGS]; -/* Flag telling postmaster that resize is needed */ -volatile bool pending_pm_shmem_resize = false; volatile bool delay_shmem_resize = false; -/* Keeps track of the previous NBuffers value */ -static int NBuffersOld = -1; -static int NBuffersPending = -1; - /* * Anonymous mapping layout we use looks like this: * @@ -137,20 +131,9 @@ static int NBuffersPending = -1; * reservation, into which shared memory segment can be 
extended and is * represented by the second /memfd:main with no permissions. * - * The reserved space for each segment is calculated as a fraction of the total - * reserved space (MaxAvailableMemory), as specified in the SHMEM_RESIZE_RATIO - * array. E.g. we allow BUFFERS_SHMEM_SEGMENT to take up to 60% of the whole - * space when resizing, based on the fact that it most likely will be the main - * consumer of this memory. Those numbers are pulled out of thin air for now, - * makes sense to evaluate them more precise. + * The reserved space for buffer manager related segments is calculated based on + * MaxNBuffers. */ -static double SHMEM_RESIZE_RATIO[6] = { - 0.15, /* MAIN_SHMEM_SEGMENT */ - 0.6, /* BUFFERS_SHMEM_SEGMENT */ - 0.1, /* BUFFER_DESCRIPTORS_SHMEM_SEGMENT */ - 0.1, /* BUFFER_IOCV_SHMEM_SEGMENT */ - 0.05, /* CHECKPOINT_BUFFERS_SHMEM_SEGMENT */ -}; /* * Flag telling that we have decided to use huge pages. @@ -160,13 +143,6 @@ static double SHMEM_RESIZE_RATIO[6] = { */ static bool huge_pages_on = false; -/* - * Flag telling that we have prepared the memory layout to be resizable. If - * false after all shared memory segments creation, it means we failed to setup - * needed layout and falled back to the regular non-resizable approach. - */ -static bool shmem_resizable = false; - /* * Currently broadcasted value of NBuffers in shared memory. * @@ -791,8 +767,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping) if (mapping->shmem_reserved < mapping->shmem_size) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("not enough shared memory is reserved"), - errhint("You may need to increase \"max_available_memory\"."))); + errmsg("not enough shared memory is reserved"))); mmap_flags = PG_MMAP_FLAGS | mmap_flags; } @@ -961,36 +936,27 @@ AnonymousShmemDetach(int status, Datum arg) } /* - * Resize all shared memory segments based on the current NBuffers value, which - * is is applied from NBuffersPending. 
The actual segment resizing is done via - * ftruncate, which will fail if is not sufficient space to expand the anon - * file. When finished, based on the new and old values initialize new buffer - * blocks if any. - * - * If reinitializing took place, as the last step this function does buffers - * reinitialization as well and broadcasts the new value of NSharedBuffers. All - * of that needs to be done only by one backend, the first one that managed to - * grab the ShmemResizeLock. + * Resize all shared memory segments based on the new shared_buffers value (saved + * in ShmemCtrl area). The actual segment resizing is done via ftruncate, which + * will fail if there is not sufficient space to expand the anon file. + * + * TODO: Rename this to BufferShmemResize() or something. Only buffer manager's + * memory should be resized in this function. */ bool AnonymousShmemResize(void) { - int numSemas; - bool reinit = false; int mmap_flags = PG_MMAP_FLAGS; Size hugepagesize; - NBuffers = NBuffersPending; - - elog(DEBUG1, "Resize shmem from %d to %d", NBuffersOld, NBuffers); - - /* - * XXX: Where to reset the flag is still an open question. E.g. do we - * consider a no-op when NBuffers is equal to NBuffersOld a genuine resize - * and reset the flag? - */ - pending_pm_shmem_resize = false; + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + /* TODO: This is a hack. NBuffersPending should never be written by anything + * other than GUC system. Find a way to pass new NBuffers value to + * BufferManagerShmemSize(). 
*/ + NBuffersPending = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + elog(DEBUG1, "Resize shmem from %d to %d", NBuffers, NBuffersPending); + #ifndef MAP_HUGETLB /* PrepareHugePages should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); @@ -1005,8 +971,8 @@ AnonymousShmemResize(void) } #endif - /* Note that CalculateShmemSize indirectly depends on NBuffers */ - CalculateShmemSize(&numSemas); + /* Note that BufferManagerShmemSize() indirectly depends on NBuffersPending. */ + BufferManagerShmemSize(false); for(int i = 0; i < ANON_MAPPINGS; i++) { @@ -1014,10 +980,18 @@ AnonymousShmemResize(void) ShmemSegment *segment = &Segments[i]; PGShmemHeader *shmem_hdr = segment->ShmemSegHdr; + /* Main shared memory segment is always static. Ignore it. */ + if (i == MAIN_SHMEM_SEGMENT) + continue; + + m->shmem_req_size = add_size(m->shmem_req_size, 8192 - (m->shmem_req_size % 8192)); #ifdef MAP_HUGETLB if (huge_pages_on && (m->shmem_req_size % hugepagesize != 0)) m->shmem_req_size += hugepagesize - (m->shmem_req_size % hugepagesize); #endif + elog(DEBUG1, "segment[%s]: requested size %zu, current size %zu, reserved %zu", + MappingName(m->shmem_segment), m->shmem_req_size, m->shmem_size, + m->shmem_reserved); if (m->shmem == NULL) continue; @@ -1025,26 +999,28 @@ AnonymousShmemResize(void) if (m->shmem_size == m->shmem_req_size) continue; + /* We should have reserved enough address space. Also made sure that the + * new size can fit in the existing mapping. PANIC if that's not the + * case. 
*/ if (m->shmem_reserved < m->shmem_req_size) - ereport(ERROR, + ereport(PANIC, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("not enough shared memory is reserved"), - errhint("You may need to increase \"max_available_memory\"."))); + errmsg("not enough shared memory is reserved"))); elog(DEBUG1, "segment[%s]: resize from %zu to %zu at address %p", MappingName(m->shmem_segment), m->shmem_size, m->shmem_req_size, m->shmem); - /* Resize the backing anon file. */ + /* Resize the backing anon file. If the operation fails, in one backend and we do not know the status in other backends, it will lead to inconsistent buffer manager structures across backends. PANIC. */ if(ftruncate(m->segment_fd, m->shmem_req_size) == -1) - ereport(FATAL, + ereport(PANIC, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("could not truncase anonymous file for \"%s\": %m", MappingName(m->shmem_segment)))); /* Adjust memory accessibility */ if(mprotect(m->shmem, m->shmem_req_size, PROT_READ | PROT_WRITE) == -1) - ereport(FATAL, + ereport(PANIC, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("could not mprotect anonymous shared memory for \"%s\": %m", MappingName(m->shmem_segment)))); @@ -1052,308 +1028,19 @@ AnonymousShmemResize(void) /* If shrinking, make reserved space unavailable again */ if(m->shmem_req_size < m->shmem_size && mprotect(m->shmem + m->shmem_req_size, m->shmem_size - m->shmem_req_size, PROT_NONE) == -1) - ereport(FATAL, + ereport(PANIC, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("could not mprotect reserved shared memory for \"%s\": %m", MappingName(m->shmem_segment)))); - reinit = true; m->shmem_size = m->shmem_req_size; shmem_hdr->totalsize = m->shmem_size; segment->ShmemEnd = m->shmem + m->shmem_size; } - if (reinit) - { - if(IsUnderPostmaster && - LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE)) - { - /* - * If the new NBuffers was already broadcasted, the buffer pool was - * already initialized before. 
- * - * Since we're not on a hot path, we use lwlocks and do not need to - * involve memory barrier. - */ - if(pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers) - { - /* - * Allow the first backend that managed to get the lock to - * reinitialize the new portion of buffer pool. Every other - * process will wait on the shared barrier for that to finish, - * since it's a part of the SHMEM_RESIZE_DONE phase. - * - * Note that it's enough when only one backend will do that, - * even the ShmemInitStruct part. The reason is that resized - * shared memory will maintain the same addresses, meaning that - * all the pointers are still valid, and we only need to update - * structures size in the ShmemIndex once -- any other backend - * will pick up this shared structure from the index. - */ - BufferManagerShmemInit(NBuffersOld); - - /* - * Wipe out the evictor PID so that it can be used for the next - * buffer resizing operation. - */ - ShmemCtrl->evictor_pid = 0; - /* If all fine, broadcast the new value */ - pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers); - } - - LWLockRelease(ShmemResizeLock); - } - } - - return true; -} - -/* - * We are asked to resize shared memory. Wait for all ProcSignal participants - * to join the barrier, then do the resize and wait on the barrier until all - * participating finish resizing as well -- otherwise we face danger of - * inconsistency between backends. - * - * XXX: If a backend is blocked on ReadCommand in PostgresMain, it will not - * proceed with AnonymousShmemResize after receiving SIGHUP, until something - * will be sent. 
- */ -bool -ProcessBarrierShmemResize(Barrier *barrier) -{ - Assert(IsUnderPostmaster); - - elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d, %d", - NBuffersOld, NBuffersPending, pending_pm_shmem_resize, delay_shmem_resize); - - /* Wait until we have seen the new NBuffers value */ - if (!pending_pm_shmem_resize) - return false; - - /* Wait till this process becomes ready to resize buffers. */ - if (delay_shmem_resize) - return false; - - /* - * First thing to do after attaching to the barrier is to wait for others. - * We can't simply use BarrierArriveAndWait, because backends might arrive - * here in disjoint groups, e.g. first two backends, pause, then second two - * backends. If the resize is quick enough that can lead to a situation - * when the first group is already finished before the second has appeared, - * and the barrier will only synchonize withing those groups. - */ - if (BarrierAttach(barrier) == SHMEM_RESIZE_REQUESTED) - WaitForProcSignalBarrierReceived( - pg_atomic_read_u64(&ShmemCtrl->Generation)); - - /* - * Now start the procedure, and elect one backend to ping postmaster to do - * the same. - * - * XXX: If we need to be able to abort resizing, this has to be done later, - * after the SHMEM_RESIZE_DONE. - */ - - /* - * Evict extra buffers when shrinking shared buffers. We need to do this - * while the memory for extra buffers is still mapped i.e. before remapping - * the shared memory segments to a smaller memory area. - */ - if (NBuffersOld > NBuffersPending) - { - BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START); - - /* - * TODO: If the buffer eviction fails for any reason, we should - * gracefully rollback the shared buffer resizing and try again. But the - * infrastructure to do so is not available right now. Hence just raise - * a FATAL so that the system restarts. 
- */ - if (!EvictExtraBuffers(NBuffersPending, NBuffersOld)) - elog(FATAL, "buffer eviction failed"); - - if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_EVICT)) - SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); - } - else - if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START)) - SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); - - AnonymousShmemResize(); - - /* The second phase means the resize has finished, SHMEM_RESIZE_DONE */ - BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE); - - if (MyBackendType == B_BG_WRITER) - { - /* - * Before resuming regular background writer activity, adjust the - * statistics collected so far. - */ - BgBufferSyncReset(NBuffersOld, NBuffers); - } - - BarrierDetach(barrier); return true; } -/* - * GUC assign hook for shared_buffers. - * - * When setting the GUC first time after starting the server, the GUC value is - * changed immediately since there is not shared memory setup yet. - * - * After the shared memory is setup, changing the GUC value requires resizing and - * reiniatializing (at least parts of) the shared memory structures related to - * shared buffers. That's a long and complicated process. It's recommended for - * an assign hook to be as minimal as possible, thus we just request shared - * memory resize and remember the previous value. - */ -void -assign_shared_buffers(int newval, void *extra, bool *pending) -{ - /* - * TODO: If a backend joins while the buffer resizing is in progress or it - * reads a value of shared_buffers from configuration which is different from - * the value being used by existing backends, this method may not work. Need - * to think of a better solution. 
- */ - if (BufferBlocks) - { - elog(DEBUG1, "bufferpool is already initialized with size = %d, reinitializing it with size = %d", - NBuffers, newval); - pending_pm_shmem_resize = true; - *pending = true; - NBuffersPending = newval; - NBuffersOld = NBuffers; - } - else - { - elog(DEBUG1, "initializing buffer pool with size = %d", newval); - NBuffers = newval; - *pending = false; - pending_pm_shmem_resize = false; - } -} - -/* - * Test if we have somehow missed a shmem resize signal and NBuffers value - * differs from NSharedBuffers. If yes, catchup and do resize. - */ -void -AdjustShmemSize(void) -{ - uint32 NSharedBuffers = pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers); - - if (NSharedBuffers != NBuffers) - { - /* - * If the broadcasted shared_buffers is different from the one we see, - * it could be that the backend has missed a resize signal. To avoid - * any inconsistency, adjust the shared mappings, before having a - * chance to access the buffer pool. - */ - ereport(LOG, - (errmsg("shared_buffers has been changed from %d to %d, " - "resize shared memory", - NBuffers, NSharedBuffers))); - NBuffers = NSharedBuffers; - AnonymousShmemResize(); - } -} - -/* - * Start resizing procedure, making sure all existing processes will have - * consistent view of shared memory size. Must be called only in postmaster. - */ -void -CoordinateShmemResize(void) -{ - elog(DEBUG1, "Coordinating shmem resize from %d to %d", - NBuffersOld, NBuffers); - Assert(!IsUnderPostmaster); - - /* - * We use dynamic barrier to help dealing with backends that were spawned - * during the resize. - */ - BarrierInit(&ShmemCtrl->Barrier, 0); - - /* - * If the value did not change, or shared memory segments are not - * initialized yet, skip the resize. 
- */ - if (NBuffersPending == NBuffersOld) - { - elog(DEBUG1, "Skip resizing, new %d, old %d", - NBuffers, NBuffersOld); - return; - } - - /* - * Shared memory resize requires some coordination done by postmaster, - * and consists of three phases: - * - * - Before the resize all existing backends have the same old NBuffers. - * - When resize is in progress, backends are expected to have a - * mixture of old a new values. They're not allowed to touch buffer - * pool during this time frame. - * - After resize has been finished, all existing backends, that can access - * the buffer pool, are expected to have the same new value of NBuffers. - * - * Those phases are ensured by joining the shared barrier associated with - * the procedure. Since resizing takes time, we need to take into account - * that during that time: - * - * - New backends can be spawned. They will check status of the barrier - * early during the bootstrap, and wait until everything is over to work - * with the new NBuffers value. - * - * - Old backends can exit before attempting to resize. Synchronization - * used between backends relies on ProcSignalBarrier and waits for all - * participants received the message at the beginning to gather all - * existing backends. - * - * - Some backends might be blocked and not responsing either before or - * after receiving the message. In the first case such backend still - * have ProcSignalSlot and should be waited for, in the second case - * shared barrier will make sure we still waiting for those backends. In - * any case there is an unbounded wait. - * - * - Backends might join barrier in disjoint groups with some time in - * between. That means that relying only on the shared dynamic barrier is - * not enough -- it will only synchronize resize procedure withing those - * groups. That's why we wait first for all participants of ProcSignal - * mechanism who received the message. 
- */ - elog(DEBUG1, "Emit a barrier for shmem resizing"); - pg_atomic_init_u64(&ShmemCtrl->Generation, - EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHMEM_RESIZE)); - - /* To order everything after setting Generation value */ - pg_memory_barrier(); - - /* - * After that postmaster waits for PMSIGNAL_SHMEM_RESIZE as a sign that all - * the rest of the pack has started the procedure and it can resize shared - * memory as well. - * - * Normally we would call WaitForProcSignalBarrier here to wait until every - * backend has reported on the ProcSignalBarrier. But for shared memory - * resize we don't need this, as every participating backend will - * synchronize on the ProcSignal barrier. In fact even if we would like to - * wait here, it wouldn't be possible -- we're in the postmaster, without - * any waiting infrastructure available. - * - * If at some point it will turn out that waiting is essential, we would - * need to consider some alternatives. E.g. it could be a designated - * coordination process, which is not a postmaster. Another option would be - * to introduce a CoordinateShmemResize lock and allow only one process to - * take it (this probably would have to be something different than - * LWLocks, since they block interrupts, and coordination relies on them). 
- */ -} - /* * PGSharedMemoryCreate * @@ -1374,7 +1061,7 @@ PGSharedMemoryCreate(AnonymousMapping *mapping, void *memAddress; PGShmemHeader *hdr; struct stat statbuf; - Size sysvsize, total_reserved; + Size sysvsize; /* * We use the data directory's ID info (inode and device numbers) to @@ -1398,12 +1085,6 @@ PGSharedMemoryCreate(AnonymousMapping *mapping, /* Prepare the mapping information */ mapping->shmem_size = mapping->shmem_req_size; - total_reserved = (Size) MaxAvailableMemory * BLCKSZ; - mapping->shmem_reserved = total_reserved * SHMEM_RESIZE_RATIO[mapping->shmem_segment]; - - /* Round up to be a multiple of BLCKSZ */ - mapping->shmem_reserved = mapping->shmem_reserved + BLCKSZ - - (mapping->shmem_reserved % BLCKSZ); if (shared_memory_type == SHMEM_TYPE_MMAP) { @@ -1666,29 +1347,6 @@ PGSharedMemoryDetach(void) } } -void -WaitOnShmemBarrier() -{ - Barrier *barrier = &ShmemCtrl->Barrier; - - /* Nothing to do if resizing is not started */ - if (BarrierPhase(barrier) < SHMEM_RESIZE_START) - return; - - BarrierAttach(barrier); - - /* Otherwise wait through all available phases */ - while (BarrierPhase(barrier) < SHMEM_RESIZE_DONE) - { - ereport(LOG, (errmsg("ProcSignal barrier is in phase %d, waiting", - BarrierPhase(barrier)))); - - BarrierArriveAndWait(barrier, 0); - } - - BarrierDetach(barrier); -} - void ShmemControlInit(void) { @@ -1700,16 +1358,13 @@ ShmemControlInit(void) if (!foundShmemCtrl) { - /* - * The barrier is missing here, it will be initialized right before - * starting the resizing process as a convenient way to reset it. 
- */ - - /* Initialize with the currently known value */ - pg_atomic_init_u32(&ShmemCtrl->NSharedBuffers, NBuffers); - - /* shmem_resizable should be initialized by now */ - ShmemCtrl->Resizable = shmem_resizable; - ShmemCtrl->evictor_pid = 0; + pg_atomic_init_u32(&ShmemCtrl->targetNBuffers, 0); + pg_atomic_init_u32(&ShmemCtrl->activeNBuffers, 0); + pg_atomic_init_u32(&ShmemCtrl->transitNBuffers, 0); + pg_atomic_init_flag(&ShmemCtrl->resize_in_progress); + + ShmemCtrl->coordinator = 0; + ShmemCtrl->pmwork_done = false; + ConditionVariableInit(&ShmemCtrl->pm_cv); } } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index ba9528d5dfa..3be146abac2 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -110,9 +110,11 @@ #include "replication/slotsync.h" #include "replication/walsender.h" #include "storage/aio_subsys.h" +#include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/io_worker.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "tcop/backend_startup.h" @@ -125,7 +127,6 @@ #ifdef EXEC_BACKEND #include "common/file_utils.h" -#include "storage/pg_shmem.h" #endif @@ -959,6 +960,11 @@ PostmasterMain(int argc, char *argv[]) */ InitializeFastPathLocks(); + /* + * Calculate MaxNBuffers for buffer pool resizing. + */ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. */ @@ -1698,9 +1704,6 @@ ServerLoop(void) if (pending_pm_pmsignal) process_pm_pmsignal(); - if (pending_pm_shmem_resize) - process_pm_shmem_resize(); - if (events[i].events & WL_SOCKET_ACCEPT) { ClientSocket s; @@ -2046,15 +2049,34 @@ process_pm_reload_request(void) } } +/* + * Handle requests from the coordinator to resize shared memory maps so that the + * new backends can inherit those. 
+ */ static void process_pm_shmem_resize(void) { + elog(LOG, "postmaster received PMSIGNAL_SHMEM_RESIZE, coordinating memory remapping"); + /* - * Failure to resize is considered to be fatal and will not be - * retried, which means we can disable pending flag right here. + * Perform the memory remapping in postmaster process. This should never fail + * since the address map is always reserved. If it fails the address maps in + * the backends will becomes inconsistent which is a fundamental assumption + * in PostgreSQL architecture. Hence PANIC. */ - pending_pm_shmem_resize = false; - CoordinateShmemResize(); + if (!AnonymousShmemResize()) + elog(PANIC, "postmaster failed to resize anonymous shared memory"); + else + { + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + elog(LOG, "postmaster successfully completed shared memory remapping"); + + BufferManagerShmemValidate(targetNBuffers); + elog(LOG, "postmaster successfully validated buffer manager shared memory"); + ShmemCtrl->pmwork_done = true; + ConditionVariableBroadcast(&ShmemCtrl->pm_cv); + NBuffers = targetNBuffers; + } } /* @@ -3878,7 +3900,7 @@ process_pm_pmsignal(void) } if (CheckPostmasterSignal(PMSIGNAL_SHMEM_RESIZE)) - AnonymousShmemResize(); + process_pm_shmem_resize(); /* * Try to advance postmaster's state machine, if a child requests it. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index be64fa5a136..80a168ec2ce 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -19,6 +19,7 @@ #include "storage/pg_shmem.h" #include "storage/bufmgr.h" #include "storage/pg_shmem.h" +#include "utils/guc.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -63,47 +64,39 @@ CkptSortItem *CkptBufferIds; /* * Initialize shared buffer pool * - * This is called once during shared-memory initialization (either in the - * postmaster, or in a standalone backend) or during shared-memory resize. 
Size - * of data structures initialized here depends on NBuffers, and to be able to - * change NBuffers without a restart we store each structure into a separate - * shared memory segment, which could be resized on demand. - * - * FirstBufferToInit tells where to start initializing buffers. For - * initialization it always will be zero, but when resizing shared-memory it - * indicates the number of already initialized buffers. - * + * This is called once during shared-memory initialization. + * TODO: Restore this function to it's initial form. This function should see no + * change in buffer resize patches, except may be use of NBuffersPending. + * * No locks are taking in this function, it is the caller responsibility to * make sure only one backend can work with new buffers. */ void -BufferManagerShmemInit(int FirstBufferToInit) +BufferManagerShmemInit(void) { bool foundBufs, foundDescs, foundIOCV, foundBufCkpt; int i; - elog(DEBUG1, "BufferManagerShmemInit from %d to %d", - FirstBufferToInit, NBuffers); /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) ShmemInitStructInSegment("Buffer Descriptors", - NBuffers * sizeof(BufferDescPadded), + NBuffersPending * sizeof(BufferDescPadded), &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, ShmemInitStructInSegment("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffersPending * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, &foundBufs, BUFFERS_SHMEM_SEGMENT)); /* Align condition variables to cacheline boundary. 
*/ BufferIOCVArray = (ConditionVariableMinimallyPadded *) ShmemInitStructInSegment("Buffer IO Condition Variables", - NBuffers * sizeof(ConditionVariableMinimallyPadded), + NBuffersPending * sizeof(ConditionVariableMinimallyPadded), &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); /* @@ -115,7 +108,7 @@ BufferManagerShmemInit(int FirstBufferToInit) */ CkptBufferIds = (CkptSortItem *) ShmemInitStructInSegment("Checkpoint BufferIds", - NBuffers * sizeof(CkptSortItem), &foundBufCkpt, + NBuffersPending * sizeof(CkptSortItem), &foundBufCkpt, CHECKPOINT_BUFFERS_SHMEM_SEGMENT); if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) @@ -124,15 +117,14 @@ BufferManagerShmemInit(int FirstBufferToInit) Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); /* * note: this path is only taken in EXEC_BACKEND case when initializing - * shared memory, or in all cases when resizing shared memory. + * shared memory. */ } -#ifndef EXEC_BACKEND /* * Initialize all the buffer headers. */ - for (i = FirstBufferToInit; i < NBuffers; i++) + for (i = 0; i < NBuffersPending; i++) { BufferDesc *buf = GetBufferDescriptor(i); @@ -150,21 +142,18 @@ BufferManagerShmemInit(int FirstBufferToInit) ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } -#endif /* - * Init other shared buffer-management stuff from scratch configuring buffer - * pool the first time. If we are just resizing buffer pool adjust only the - * required structures. + * Init other shared buffer-management stuff. */ - if (FirstBufferToInit == 0) - StrategyInitialize(!foundDescs); - else - StrategyReInitialize(FirstBufferToInit); + StrategyInitialize(!foundDescs); /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); + + /* Declare the size of current buffer pool. */ + NBuffers = NBuffersPending; } /* @@ -175,30 +164,61 @@ BufferManagerShmemInit(int FirstBufferToInit) * shared memory segment. 
The main segment must not allocate anything * related to buffers, every other segment will receive part of the * data. + * + * If set_reserved is true, also sets the shmem_reserved field for each + * segment based on MaxNBuffers. This should be true during server startup + * but false during buffer pool resizing. */ Size -BufferManagerShmemSize(void) +BufferManagerShmemSize(bool set_reserved) { size_t size; /* size of buffer descriptors, plus alignment padding */ - size = add_size(0, mul_size(NBuffers, sizeof(BufferDescPadded))); + size = add_size(0, mul_size(NBuffersPending, sizeof(BufferDescPadded))); size = add_size(size, PG_CACHE_LINE_SIZE); Mappings[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size; + if (set_reserved) + { + /* reserved size based on MaxNBuffers */ + size = add_size(0, mul_size(MaxNBuffers, sizeof(BufferDescPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + Mappings[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_reserved = size; + } /* size of data pages, plus alignment padding */ size = add_size(0, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); + size = add_size(size, mul_size(NBuffersPending, BLCKSZ)); Mappings[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size; + if (set_reserved) + { + /* reserved size based on MaxNBuffers */ + size = add_size(0, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(MaxNBuffers, BLCKSZ)); + Mappings[BUFFERS_SHMEM_SEGMENT].shmem_reserved = size; + } /* size of I/O condition variables, plus alignment padding */ - size = add_size(0, mul_size(NBuffers, + size = add_size(0, mul_size(NBuffersPending, sizeof(ConditionVariableMinimallyPadded))); size = add_size(size, PG_CACHE_LINE_SIZE); Mappings[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size; + if (set_reserved) + { + /* reserved size based on MaxNBuffers */ + size = add_size(0, mul_size(MaxNBuffers, + sizeof(ConditionVariableMinimallyPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + 
Mappings[BUFFER_IOCV_SHMEM_SEGMENT].shmem_reserved = size; + } /* size of checkpoint sort array in bufmgr.c */ - Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffers, sizeof(CkptSortItem)); + Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffersPending, sizeof(CkptSortItem)); + if (set_reserved) + { + /* reserved size based on MaxNBuffers */ + Mappings[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(MaxNBuffers, sizeof(CkptSortItem)); + } /* Allocations in the main memory segment, at the end. */ @@ -207,3 +227,181 @@ BufferManagerShmemSize(void) return size; } + +/* + * Reinitialize shared buffer manager structures when resizing the buffer pool. + * + * This function is called in the backend which coordinates buffer resizing + * operation. + * + * TODO: Avoid code duplication with BufferManagerShmemInit() and also assess + * which functionality in the latter is required in this function. + */ +void +BufferManagerShmemResize(int currentNBuffers, int targetNBuffers) +{ + bool found; + int i; + void *tmpPtr; + + tmpPtr = (BufferDescPadded *) + ShmemUpdateStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (BufferDescriptors != tmpPtr || !found) + elog(FATAL, "resizing buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemUpdateStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (BufferIOCVArray != tmpPtr || !found) + elog(FATAL, "resizing buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + tmpPtr = (CkptSortItem *) + ShmemUpdateStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if 
(CkptBufferIds != tmpPtr || !found) + elog(FATAL, "resizing checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d", + CkptBufferIds, tmpPtr, found); + + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemUpdateStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (BufferBlocks != tmpPtr || !found) + elog(FATAL, "resizing buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); + + /* + * Initialize the headers for new buffers. If we are shrinking the + * buffers, currentNBuffers >= targetNBuffers, thus this loop doesn't execute. + */ + for (i = currentNBuffers; i < targetNBuffers; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); + + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + + buf->buf_id = i; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + StrategyReset(targetNBuffers); +} + +/* + * BufferManagerShmemValidate + * Validate that buffer manager shared memory structures have correct + * pointers and sizes after a resize operation. + * + * This function is called by backends during ProcessBarrierShmemResizeStruct + * to ensure their view of the buffer structures is consistent after memory + * remapping. 
+ */ +void +BufferManagerShmemValidate(int targetNBuffers) +{ + bool found; + void *tmpPtr; + + /* Validate Buffer Descriptors */ + tmpPtr = (BufferDescPadded *) + ShmemInitStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (!found || BufferDescriptors != tmpPtr) + elog(FATAL, "validating buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + /* Validate Buffer IO Condition Variables */ + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemInitStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (!found || BufferIOCVArray != tmpPtr) + elog(FATAL, "validating buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + /* Validate Checkpoint BufferIds */ + tmpPtr = (CkptSortItem *) + ShmemInitStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if (!found || CkptBufferIds != tmpPtr) + elog(FATAL, "validating checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d", + CkptBufferIds, tmpPtr, found); + + /* Validate Buffer Blocks */ + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (!found || BufferBlocks != tmpPtr) + elog(FATAL, "validating buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); +} + +/* + * check_shared_buffers + * GUC check_hook for shared_buffers + * + * When reloading the configuration, shared_buffers should not be set to a value + * higher than max_shared_buffers fixed at the boot time. 
+ */ +bool +check_shared_buffers(int *newval, void **extra, GucSource source) +{ + if (finalMaxNBuffers && *newval > MaxNBuffers) + { + GUC_check_errdetail("\"shared_buffers\" must be less than \"max_shared_buffers\"."); + return false; + } + return true; +} + +/* + * show_shared_buffers + * GUC show_hook for shared_buffers + * + * Shows both current and pending buffer counts with proper unit formatting. + */ +const char * +show_shared_buffers(void) +{ + static char buffer[128]; + int64 current_value, pending_value; + const char *current_unit, *pending_unit; + + if (NBuffers == NBuffersPending) + { + /* No buffer pool resizing pending. */ + convert_int_from_base_unit(NBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s", current_value, current_unit); + } + else + { + /* + * New value for NBuffers is loaded but not applied yet, show both + * current and pending. + */ + convert_int_from_base_unit(NBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + convert_int_from_base_unit(NBuffersPending, GUC_UNIT_BLOCKS, &pending_value, &pending_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s (pending: " INT64_FORMAT "%s)", + current_value, current_unit, pending_value, pending_unit); + } + + return buffer; +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index fdcb5556235..14200a38a0f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3631,12 +3631,12 @@ static float smoothed_alloc = 0; static float smoothed_density = 10.0; void -BgBufferSyncReset(int NBuffersOld, int NBuffersNew) +BgBufferSyncReset(int currentNBuffers, int targetNBuffers) { saved_info_valid = false; #ifdef BGW_DEBUG elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d", - NBuffersOld, NBuffersNew); + currentNBuffers, targetNBuffers); #endif } @@ -3686,8 +3686,11 @@ BgBufferSync(WritebackContext *wb_context) * valid. 
If the buffer pool is being expanded, more buffers will become * available without even this function writing out any. Hence wait till * buffer resizing finishes i.e. go into hibernation mode. + * + * TODO: We may not need this synchronization if background worker itself + * becomes the coordinator. */ - if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers) + if (!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) return true; /* @@ -3883,7 +3886,7 @@ BgBufferSync(WritebackContext *wb_context) * finish. */ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est && - pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) == NBuffers) + !pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) { int sync_state = SyncOneBuffer(next_to_clean, true, wb_context); @@ -4255,7 +4258,23 @@ DebugPrintBufferRefcount(Buffer buffer) void CheckPointBuffers(int flags) { + /* Mark that buffer sync is in progress - delay any shared memory resizing. */ + /* + * TODO: We need to assess whether we should allow checkpoint and buffer + * resizing to run in parallel. When expanding buffers it may be fine to let + * the checkpointer run in RESIZE_MAP_AND_MEM phase but delay phase EXPAND + * phase till the checkpoint finishes, at the same time not allow checkpoint + * to run during expansion phase. When shrinking the buffers, we should + * delay SHRINK phase till checkpoint finishes and not allow to start + * checkpoint till SHRINK phase is done, but allow it to run in + * RESIZE_MAP_AND_MEM phase. This needs careful analysis and testing. + */ + delay_shmem_resize = true; + BufferSync(flags); + + /* Mark that buffer sync is no longer in progress - allow shared memory resizing */ + delay_shmem_resize = false; } /* @@ -7504,10 +7523,12 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { * of the shrunk buffer pool. 
*/ bool -EvictExtraBuffers(int newBufSize, int oldBufSize) +EvictExtraBuffers(int targetNBuffers, int currentNBuffers) { bool result = true; + Assert(targetNBuffers < currentNBuffers); + /* * If the buffer being evicated is locked, this function will need to wait. * This function should not be called from a Postmaster since it can not wait on a lock. @@ -7515,77 +7536,50 @@ EvictExtraBuffers(int newBufSize, int oldBufSize) Assert(IsUnderPostmaster); /* - * Let only one backend perform eviction. We could split the work across all - * the backends but that doesn't seem necessary. - * - * The first backend to acquire ShmemResizeLock, sets its own PID as the - * evictor PID for other backends to know that the eviction is in progress or - * has already been performed. The evictor backend releases the lock when it - * finishes eviction. While the eviction is in progress, backends other than - * evictor backend won't be able to take the lock. They won't perform - * eviction. A backend may acquire the lock after eviction has completed, but - * it will not perform eviction since the evictor PID is already set. Evictor - * PID is reset only when the buffer resizing finishes. Thus only one backend - * will perform eviction in a given instance of shared buffers resizing. - * - * Any backend which acquires this lock will release it before the eviction - * phase finishes, hence the same lock can be reused for the next phase of - * resizing buffers. + * TODO: Before evicting any buffer, we should check whether any of the + * buffers are pinned. If we find that a buffer is pinned after evicting + * most of them, that will impact performance since all those evicted + * buffers might need to be read again. 
*/ - if (LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE)) + for (Buffer buf = targetNBuffers + 1; buf <= currentNBuffers; buf++) { - if (ShmemCtrl->evictor_pid == 0) - { - ShmemCtrl->evictor_pid = MyProcPid; - - /* - * TODO: Before evicting any buffer, we should check whether any of the - * buffers are pinned. If we find that a buffer is pinned after evicting - * most of them, that will impact performance since all those evicted - * buffers might need to be read again. - */ - for (Buffer buf = newBufSize + 1; buf <= oldBufSize; buf++) - { - BufferDesc *desc = GetBufferDescriptor(buf - 1); - uint32 buf_state; - bool buffer_flushed; + BufferDesc *desc = GetBufferDescriptor(buf - 1); + uint32 buf_state; + bool buffer_flushed; - buf_state = pg_atomic_read_u32(&desc->state); + buf_state = pg_atomic_read_u32(&desc->state); - /* - * Nobody is expected to touch the buffers while resizing is - * going one hence unlocked precheck should be safe and saves - * some cycles. - */ - if (!(buf_state & BM_VALID)) - continue; + /* + * Nobody is expected to touch the buffers while resizing is + * going one hence unlocked precheck should be safe and saves + * some cycles. + */ + if (!(buf_state & BM_VALID)) + continue; - /* - * XXX: Looks like CurrentResourceOwner can be NULL here, find - * another one in that case? - * */ - if (CurrentResourceOwner) - ResourceOwnerEnlarge(CurrentResourceOwner); + /* + * XXX: Looks like CurrentResourceOwner can be NULL here, find + * another one in that case? + * */ + if (CurrentResourceOwner) + ResourceOwnerEnlarge(CurrentResourceOwner); - ReservePrivateRefCountEntry(); + ReservePrivateRefCountEntry(); - LockBufHdr(desc); + LockBufHdr(desc); - /* - * Now that we have locked buffer descriptor, make sure that the - * buffer without valid data has been skipped above. - */ - Assert(buf_state & BM_VALID); + /* + * Now that we have locked buffer descriptor, make sure that the + * buffer without valid data has been skipped above. 
+ */ + Assert(buf_state & BM_VALID); - if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) - { - elog(WARNING, "could not remove buffer %u, it is pinned", buf); - result = false; - break; - } - } + if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) + { + elog(WARNING, "could not remove buffer %u, it is pinned", buf); + result = false; + break; } - LWLockRelease(ShmemResizeLock); } return result; diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 55be5eebe0a..c09875934d4 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,10 +33,16 @@ typedef struct /* Spinlock: protects the values below */ slock_t buffer_strategy_lock; + /* + * Number of active buffers that can be allocated. During buffer resizing, + * this may be different from NBuffers which tracks the global buffer count. + */ + pg_atomic_uint32 activeNBuffers; + /* * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to - * get an actual buffer, it needs to be used modulo NBuffers. + * get an actual buffer, it needs to be used modulo activeNBuffers. 
*/ pg_atomic_uint32 nextVictimBuffer; @@ -101,6 +107,7 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; + int activeBuffers; /* * Atomically move hand ahead one buffer - if there's several processes @@ -110,12 +117,15 @@ ClockSweepTick(void) victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); - if (victim >= NBuffers) + /* Read the current active buffer count atomically */ + activeBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + + if (victim >= activeBuffers) { uint32 originalVictim = victim; /* always wrap what we look up in BufferDescriptors */ - victim = victim % NBuffers; + victim = victim % activeBuffers; /* * If we're the one that just caused a wraparound, force @@ -143,7 +153,7 @@ ClockSweepTick(void) */ SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - wrapped = expected % NBuffers; + wrapped = expected % activeBuffers; success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, &expected, wrapped); @@ -177,6 +187,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r BufferDesc *buf; int bgwprocno; int trycounter; + int activeNBuffers; *from_ring = false; @@ -228,7 +239,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); /* Use the "clock sweep" algorithm to find a free buffer */ - trycounter = NBuffers; + activeNBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + trycounter = activeNBuffers; + for (;;) { uint32 old_buf_state; @@ -280,7 +293,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, local_buf_state)) { - trycounter = NBuffers; + trycounter = activeNBuffers; break; } } @@ -323,10 +336,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) { uint32 nextVictimBuffer; int result; + uint32 activeNBuffers; 
SpinLockAcquire(&StrategyControl->buffer_strategy_lock); nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); - result = nextVictimBuffer % NBuffers; + activeNBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + result = nextVictimBuffer % activeNBuffers; if (complete_passes) { @@ -336,7 +351,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) * Additionally add the number of wraparounds that happened before * completePasses could be incremented. C.f. ClockSweepTick(). */ - *complete_passes += nextVictimBuffer / NBuffers; + *complete_passes += nextVictimBuffer / activeNBuffers; } if (num_buf_alloc) @@ -391,6 +406,31 @@ StrategyShmemSize(void) return size; } +void +StrategyReset(int activeNBuffers) +{ + Assert(StrategyControl); + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + /* Update the active buffer count for the strategy */ + pg_atomic_write_u32(&StrategyControl->activeNBuffers, activeNBuffers); + + /* Reset the clock-sweep pointer to start from beginning */ + pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0); + + /* + * The statistics is viewed in the context of the number of shared buffers. + * Reset it as the size of active number of shared buffers changes. + */ + StrategyControl->completePasses = 0; + pg_atomic_write_u32(&StrategyControl->numBufferAllocs, 0); + + /* TODO: Do we need to seset background writer notifications? */ + StrategyControl->bgwprocno = -1; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); +} + /* * StrategyInitialize -- initialize the buffer cache replacement * strategy. @@ -416,13 +456,13 @@ StrategyInitialize(bool init) * directory without rehashing all the entries. Just allocating more entries * will lead to more contention. Hence we setup the buffer lookup table * considering the maximum possible size of the buffer pool which is - * MaxAvailableMemory. + * MaxNBuffers. 
* * Additionally BufferAlloc() tries to insert a new entry before deleting the * old. In principle this could be happening in each partition concurrently, * so we need extra NUM_BUFFER_PARTITIONS entries. */ - InitBufTable(MaxAvailableMemory + NUM_BUFFER_PARTITIONS); + InitBufTable(MaxNBuffers + NUM_BUFFER_PARTITIONS); /* * Get or create the shared strategy control block @@ -441,6 +481,8 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); + /* Initialize the active buffer count */ + pg_atomic_init_u32(&StrategyControl->activeNBuffers, NBuffersPending); /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); @@ -455,62 +497,6 @@ StrategyInitialize(bool init) Assert(!init); } -/* - * StrategyReInitialize -- re-initialize the buffer cache replacement - * strategy. - * - * To be called when resizing buffer manager and only from the coordinator. - * TODO: Assess the differences between this function and StrategyInitialize(). - */ -void -StrategyReInitialize(int FirstBufferIdToInit) -{ - bool found; - - /* - * Resizing memory for buffer pools should not affect the address of - * StrategyControl. - */ - if (StrategyControl != (BufferStrategyControl *) - ShmemInitStructInSegment("Buffer Strategy Status", - sizeof(BufferStrategyControl), - &found, MAIN_SHMEM_SEGMENT)) - elog(FATAL, "something went wrong while re-initializing the buffer strategy"); - - Assert(found); - - /* TODO: Buffer lookup table adjustment: There are two options: - * - * 1. Resize the buffer lookup table to match the new number of buffers. But - * this requires rehashing all the entries in the buffer lookup table with - * the new table size. - * - * 2. Allocate maximum size of the buffer lookup table at the beginning and - * never resize it. This leaves sparse buffer lookup table which is - * inefficient from both memory and time perspective. 
According to David - * Rowley, the sparse entries in the buffer look up table cause frequent - * cacheline reload which affect performance. If the impact of that - * inefficiency in a benchmark is significant, we will need to consider first - * option. - */ - /* - * The clock sweep tick pointer might have got invalidated. Reset it as if - * starting a fresh server. - */ - pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0); - - /* - * The old statistics is viewed in the context of the number of shared - * buffers. It does not make sense now that the number of shared buffers - * itself has changed. - */ - StrategyControl->completePasses = 0; - pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); - - /* No pending notification */ - StrategyControl->bgwprocno = -1; -} - /* ---------------------------------------------------------------- * Backend-private buffer ring management diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index bd75f06047e..cfd952e621e 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -133,7 +133,7 @@ CalculateShmemSize(int *num_semaphores) * memory segment that it uses in the corresponding AnonymousMappings. * Consider size required from only the main shared memory segment here. */ - size = add_size(size, BufferManagerShmemSize()); + size = add_size(size, BufferManagerShmemSize(true)); size = add_size(size, LockManagerShmemSize()); size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); @@ -187,10 +187,14 @@ CalculateShmemSize(int *num_semaphores) * shared memory segment. 
*/ Mappings[MAIN_SHMEM_SEGMENT].shmem_req_size = size; + Mappings[MAIN_SHMEM_SEGMENT].shmem_reserved = size; /* might as well round it off to a multiple of a typical page size */ for (int segment = 0; segment < ANON_MAPPINGS; segment++) + { Mappings[segment].shmem_req_size = add_size(Mappings[segment].shmem_req_size, 8192 - (Mappings[segment].shmem_req_size % 8192)); + Mappings[segment].shmem_reserved = add_size(Mappings[segment].shmem_reserved, 8192 - (Mappings[segment].shmem_reserved % 8192)); + } return size; } @@ -341,7 +345,7 @@ CreateOrAttachShmemStructs(void) CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); - BufferManagerShmemInit(0); + BufferManagerShmemInit(); /* * Set up lock manager diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 2160d258fa7..0a173f038a3 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -657,9 +657,17 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; - case PROCSIGNAL_BARRIER_SHMEM_RESIZE: - processed = ProcessBarrierShmemResize( - &ShmemCtrl->Barrier); + case PROCSIGNAL_BARRIER_SHBUF_SHRINK: + processed = ProcessBarrierShmemShrink(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM: + processed = ProcessBarrierShmemResizeMapAndMem(); + break; + case PROCSIGNAL_BARRIER_SHBUF_EXPAND: + processed = ProcessBarrierShmemExpand(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED: + processed = ProcessBarrierShmemResizeFailed(); break; } diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 0f9abf69fd5..9793d27042a 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -69,11 +69,19 @@ #include "funcapi.h" #include "miscadmin.h" #include "port/pg_numa.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" #include 
"storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" #include "storage/shmem.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" static void *ShmemAllocRaw(Size size, Size *allocated_size); static void *ShmemAllocRawInSegment(Size size, Size *allocated_size, @@ -498,28 +506,15 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, { /* * Structure is in the shmem index so someone else has allocated it - * already. Verify the structure's size: - * - If it's the same, we've found the expected structure. - * - If it's different, we're resizing the expected structure. - * - * XXX: There is an implicit assumption this can only happen in - * "resizable" segments, where only one shared structure is allowed. - * This has to be implemented more cleanly. Probably we should implement - * ShmemReallocRawInSegment functionality just to adjust the size - * according to alignment, return the allocated size and update the - * mapping offset. + * already. The size better be the same as the size we are trying to */ if (result->size != size) { - Size delta = size - result->size; - - result->size = size; - result->allocated_size = size; - - /* Reflect size change in the shared segment */ - SpinLockAcquire(Segments[shmem_segment].ShmemLock); - Segments[shmem_segment].ShmemSegHdr->freeoffset += delta; - SpinLockRelease(Segments[shmem_segment].ShmemLock); + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errmsg("ShmemIndex entry size is wrong for data structure" + " \"%s\": expected %zu, actual %zu", + name, size, result->size))); } structPtr = result->location; @@ -556,6 +551,59 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, return structPtr; } +/* + * ShmemUpdateStructInSegment -- Update the size of a structure in shared memory. + * + * This function updates the size of an existing shared memory structure. 
It + * finds the structure in the shmem index and updates its size information while + * preserving the existing memory location. + * + * Returns: pointer to the existing structure location. + */ +void * +ShmemUpdateStructInSegment(const char *name, Size size, bool *foundPtr, + int shmem_segment) +{ + ShmemIndexEnt *result; + void *structPtr; + Size delta; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + Assert(ShmemIndex); + + /* Look up the structure in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_FIND, foundPtr); + + Assert(*foundPtr); + Assert(result); + Assert(result->shmem_segment == shmem_segment); + + delta = size - result->size; + /* Store the existing structure pointer */ + structPtr = result->location; + + /* Update the size information. + TODO: Ideally we should implement repalloc kind of functionality for shared memory which will return allocated size. */ + result->size = size; + result->allocated_size = size; + + /* Reflect size change in the shared segment */ + SpinLockAcquire(Segments[shmem_segment].ShmemLock); + Segments[shmem_segment].ShmemSegHdr->freeoffset += delta; + SpinLockRelease(Segments[shmem_segment].ShmemLock); + LWLockRelease(ShmemIndexLock); + + /* Verify the structure is still in the correct segment */ + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + + + /* * Add two Size values, checking for overflow */ @@ -892,3 +940,402 @@ pg_get_shmem_segments(PG_FUNCTION_ARGS) return (Datum) 0; } +/* + * TODO: The function henceforth are related to buffer manager and better be + * placed in buffer manager related file. + */ + +/* + * Prepare ShmemCtrl for resizing the shared buffer pool. 
+ */ +static void +MarkBufferResizingStart(int targetNBuffers, int currentNBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, currentNBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, targetNBuffers); + pg_atomic_write_u32(&ShmemCtrl->activeNBuffers, Min(targetNBuffers, currentNBuffers)); + pg_atomic_write_u32(&ShmemCtrl->transitNBuffers, currentNBuffers); + ShmemCtrl->coordinator = MyProcPid; + ShmemCtrl->pmwork_done = false; +} + +/* + * Reset ShmemCtrl after resizing the shared buffer pool is done. + */ +static void +MarkBufferResizingEnd(int NBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, NBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, NBuffers); + pg_atomic_write_u32(&ShmemCtrl->activeNBuffers, NBuffers); + pg_atomic_write_u32(&ShmemCtrl->transitNBuffers, NBuffers); + ShmemCtrl->coordinator = -1; + ShmemCtrl->pmwork_done = false; +} + +/* + * Function which updates the shared buffers according to the current values of + * shared_buffers GUCs. + * + * When resizing the buffer pool is divided into two portions + * + * - active buffer pool, which is the part of buffer pool which remains active + * even during resizing. Its size is given by activeNBuffers. Newly allocated + * buffers will have their buffer ids less than activeNBuffers. + * + * - in-transit buffer pool, which is the part of buffer pool which may be + * accessible to some backends but not others. When shrinking the buffer pool + * this is the part of buffer pool which will be evicted. When expanding the + * buffer pool this is the expanded portion. Its size is given by + * transitNBuffers. The backends may see buffer ids upto transitNBuffers. + * + * Before starting resizing, activeNBuffers = transitNBuffers = NBuffers. And + * NewNBuffers is the new size of shared buffer pool. 
+ * + * In order to synchronize with other running backends, the coordinator sends + * the following ProcSignalBarriers in the order given below: + * + * 1. When shrinking the shared buffer pool (with size NBuffers), the coordinator + * sends SHBUF_SHRINK ProcSignalBarrier. Every backend sets activeNBuffers = + * NewNBuffers to restrict its buffer pool allocations to the new size of the + * buffer pool and acknowledges the ProcSignalBarrier. Once every backend has + * acknowledged, the coordinator evicts the buffers in the area being shrunk. + * Note that transitNBuffers is still NBuffers, so the backends may see buffer ids + * up to NBuffers from earlier allocations. + * + * 2. In both cases, when expanding the buffer pool or shrinking the buffer pool, + * the coordinator sends SHBUF_RESIZE_MAP_AND_MEM ProcSignalBarrier. Every + * backend is expected to adjust their shared memory segment maps (by calling + * AnonymousShmemResize()) and validate that their pointers to the shared buffers + * structure are valid and have the right size. When shrinking the shared buffer pool + * transitNBuffers is set to NewNBuffers and the backends should no longer see + * buffer ids beyond NewNBuffers. When expanding they should also set + * transitNBuffers to NewNBuffers to accommodate backends which may accept the + * next barrier earlier than the others. After this the backends should + * acknowledge the ProcSignalBarrier. + * + * 3. When expanding the buffer pool, the coordinator sends SHBUF_EXPAND + * ProcSignalBarrier. The backends are expected to set activeNBuffers = + * NewNBuffers and start allocating buffers from the expanded range. + * + * Find a better place for this function, also a name if we find this interface + * viable. + * + * TODO: Should this function be in bufmgr.c? + * + * TODO: Handle the case when the backend executing this function dies or the + * query is cancelled. 
+ */ +Datum +pg_resize_shared_buffers(PG_FUNCTION_ARGS) +{ + bool result = true; + int currentNBuffers = NBuffers; + int targetNBuffers = NBuffersPending; + + if (currentNBuffers == targetNBuffers) + { + elog(LOG, "shared buffers are already at %d, no need to resize", currentNBuffers); + PG_RETURN_BOOL(true); + } + + if (!pg_atomic_test_set_flag(&ShmemCtrl->resize_in_progress)) + { + elog(LOG, "shared buffer resizing already in progress"); + PG_RETURN_BOOL(false); + } + + MarkBufferResizingStart(targetNBuffers, currentNBuffers); + elog(LOG, "resizing shared buffers from %d to %d", currentNBuffers, targetNBuffers); + + INJECTION_POINT("pg-resize-shared-buffers-flag-set", NULL); + + /* Phase 1: SHBUF_SHRINK - Only for shrinking buffer pool */ + if (targetNBuffers < currentNBuffers) + { + /* + * Phase 1: Shrinking - send SHBUF_SHRINK barrier + * Every backend sets activeNBuffers = NewNBuffers to restrict + * buffer pool allocations to the new size + */ + elog(LOG, "Phase 1: Shrinking buffer pool, restricting allocations to %d buffers", targetNBuffers); + + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHBUF_SHRINK)); + elog(LOG, "all backends acknowledged shrink phase"); + + /* Evict buffers in the area being shrunk */ + elog(LOG, "evicting buffers %u..%u", targetNBuffers + 1, currentNBuffers); + if (!EvictExtraBuffers(targetNBuffers, currentNBuffers)) + { + elog(ERROR, "failed to evict extra buffers during shrinking"); + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED)); + MarkBufferResizingEnd(currentNBuffers); + pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + Assert(NBuffers == currentNBuffers); + NBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + PG_RETURN_BOOL(false); + } + + /* This backend handles NBuffers itself instead of relying on the barrier + * handler, so that barrier handlers do not interfere with its + * operations. 
*/ + NBuffers = targetNBuffers; + } + + /* Phase 2: SHBUF_RESIZE_MAP_AND_MEM - Both expanding and shrinking */ + elog(LOG, "Phase 2: Remapping shared memory segments and updating structures"); + if (!AnonymousShmemResize()) + { + /* + * This should never fail since address map should already be reserved. + * So the failure should be treated as PANIC. + */ + elog(PANIC, "failed to resize anonymous shared memory"); + } + + /* When shrinking no backends should see buffers beyond active portion of the + * buffer pool. When expanding, update transitNBuffers so backends can see + * the new range. */ + pg_atomic_write_u32(&ShmemCtrl->transitNBuffers, targetNBuffers); + + /* Update structure pointers and sizes */ + BufferManagerShmemResize(currentNBuffers, targetNBuffers); + + /* Request Postmaster to remap and resize. TODO: Handle the case when Postmaster is not able to remap and resize the shared memory structures. */ + SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); + elog(LOG, "waiting for the postmaster to finish remapping and resizing the shared buffers"); + while (!ShmemCtrl->pmwork_done) + { + if (ConditionVariableTimedSleep(&ShmemCtrl->pm_cv, + 5000, + WAIT_EVENT_PM_BUFFER_RESIZE_WAIT)) + ereport(LOG, + (errmsg("still waiting for the postmaster PID %d to finish resizing buffers", + (int) PostmasterPid))); + } + ConditionVariableCancelSleep(); + elog(LOG, "postmaster remapped and resized the shared memory"); + + WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM)); + elog(LOG, "all backends acknowledged memory remapping and structure updates"); + + /* Phase 3: SHBUF_EXPAND - Only for expanding buffer pool */ + if (targetNBuffers > currentNBuffers) + { + /* + * Phase 3: Expanding - send SHBUF_EXPAND barrier + * Backends set activeNBuffers = NewNBuffers and start allocating + * buffers from the expanded range + */ + elog(LOG, "Phase 3: Expanding buffer pool, enabling allocations up to %d buffers", targetNBuffers); + + 
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SHBUF_EXPAND)); + elog(LOG, "all backends acknowledged expand phase"); + + /* This backend handles NBuffers itself instead of relying on the barrier + * handler, so that barrier handlers do not interfere with its + * operations. */ + NBuffers = targetNBuffers; + } + + /* + * Reset buffer resize control area. + */ + MarkBufferResizingEnd(targetNBuffers); + + pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + + elog(LOG, "successfully resized shared buffers to %d", targetNBuffers); + + PG_RETURN_BOOL(result); +} + +bool +ProcessBarrierShmemShrink(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + int activeNBuffers = pg_atomic_read_u32(&ShmemCtrl->activeNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* The work to be done by the coordinator is done in the function which sends the barriers. Hence acknowledge immediately. */ + if (ShmemCtrl->coordinator == MyProcPid) + { + elog(LOG, "Phase 1: Coordinator backend %d acknowledging SHBUF_SHRINK barrier immediately", MyProcPid); + return true; + } + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. + */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 1: Delaying SHBUF_SHRINK barrier - restricting allocations from %d to %d buffers, coordinator is %d", + NBuffers, targetNBuffers, ShmemCtrl->coordinator); + + return false; + } + + elog(LOG, "Phase 1: Processing SHBUF_SHRINK barrier - restricting allocations from %d to %d buffers, coordinator is %d", + NBuffers, targetNBuffers, ShmemCtrl->coordinator); + + if (MyBackendType == B_BG_WRITER) + { + /* + * Before resuming regular background writer activity, adjust the + * statistics collected so far. 
+ */ + BgBufferSyncReset(NBuffers, targetNBuffers); + /* Reset strategy control to new size */ + StrategyReset(targetNBuffers); + } + + /* Update local knowledge of activeNBuffers */ + NBuffers = activeNBuffers; + + return true; +} + +bool +ProcessBarrierShmemResizeMapAndMem(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); +#ifdef USE_ASSERT_CHECKING + int activeNBuffers = pg_atomic_read_u32(&ShmemCtrl->activeNBuffers); + int transitNBuffers = pg_atomic_read_u32(&ShmemCtrl->transitNBuffers); +#endif /* USE_ASSERT_CHECKING */ + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* The work to be done by the coordinator is done in the function which sends the barriers. Hence acknowledge immediately. */ + if (ShmemCtrl->coordinator == MyProcPid) + { + elog(LOG, "Phase 2: Coordinator backend %d acknowledging SHBUF_RESIZE_MAP_AND_MEM barrier immediately", MyProcPid); + return true; + } + + /* + * If buffer pool is being shrunk, we are already working with a smaller + * buffer pool, so shrinking address space and shared structures should not + * be a problem. When expanding, expanding the address space and shared + * structures beyond the current boundaries is not going to be a problem + * since we are not accessing that memory yet. So there is no reason to + * delay processing this barrier. + */ + + elog(LOG, "Phase 2: Processing SHBUF_RESIZE_MAP_AND_MEM barrier - adjusting memory maps and validating structures, coordinator is %d", + ShmemCtrl->coordinator); + + /* + * NBuffers should already be set to activeNBuffers from Phase 1. + * When shrinking, NBuffers should also be same as transitNBuffers in this phase. + */ + Assert(NBuffers == activeNBuffers); + if (targetNBuffers < pg_atomic_read_u32(&ShmemCtrl->currentNBuffers)) + { + /* Shrinking case - verify NBuffers equals transitNBuffers */ + Assert(NBuffers == transitNBuffers); + } + + /* + * Address space should already be reserved so resizing should not fail. 
If + * it fails, the address map of this backend may go out of sync with other + * backends. Hence PANIC. + */ + if (!AnonymousShmemResize()) + elog(PANIC, "failed to resize anonymous shared memory in backend %d", MyProcPid); + + elog(LOG, "Backend %d successfully remapped shared memory segments for buffer resize", MyProcPid); + + /* + * Backends validate that their pointers to shared buffer structures are + * still valid and have the correct size after memory remapping. + */ + BufferManagerShmemValidate(targetNBuffers); + + /* + * TODO: Save new transitNBuffers value in process local memory, if + * necessary. + */ + elog(LOG, "Backend %d successfully validated structure pointers after resize", MyProcPid); + + return true; +} + +bool +ProcessBarrierShmemExpand(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); +#ifdef USE_ASSERT_CHECKING + int transitNBuffers = pg_atomic_read_u32(&ShmemCtrl->transitNBuffers); +#endif /* USE_ASSERT_CHECKING */ + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* The work to be done by the coordinator is done in the function which sends the barriers. Hence acknowledge immediately. */ + if (ShmemCtrl->coordinator == MyProcPid) + { + elog(LOG, "Phase 3: Coordinator backend %d acknowledging SHBUF_EXPAND barrier immediately", MyProcPid); + return true; + } + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. 
+ */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 3: delaying SHBUF_EXPAND barrier - enabling allocations up to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + return false; + } + + elog(LOG, "Phase 3: Processing SHBUF_EXPAND barrier - enabling allocations up to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + + if (MyBackendType == B_BG_WRITER) + { + /* + * Adjust background writer statistics for the expanded buffer pool + */ + BgBufferSyncReset(NBuffers, targetNBuffers); + StrategyReset(targetNBuffers); + } + + /* Update local knowledge about the size of active buffer pool. */ + NBuffers = targetNBuffers; + + /* When expanding, NBuffers should be same as transitNBuffers previous phase. */ + Assert(NBuffers == transitNBuffers); + + return true; +} + +bool +ProcessBarrierShmemResizeFailed(void) +{ + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* The work to be done by the coordinator is done in the function which sends the barriers. Hence acknowledge immediately. */ + if (ShmemCtrl->coordinator == MyProcPid) + { + elog(LOG, "Coordinator backend %d acknowledging SHBUF_RESIZE_FAILED barrier immediately", MyProcPid); + return true; + } + + elog(LOG, "received proc signal indicating failure to resize shared buffers from %d to %d, restoring to %d, coordinator is %d", + NBuffers, targetNBuffers, currentNBuffers, ShmemCtrl->coordinator); + + /* Restore NBuffers to the original value */ + NBuffers = currentNBuffers; + + return true; +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index ee9f308379c..b43f1408855 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4129,6 +4129,9 @@ PostgresSingleUserMain(int argc, char *argv[], /* Initialize size of fast-path lock cache. 
*/ InitializeFastPathLocks(); + /* Initialize MaxNBuffers for buffer pool resizing. */ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. */ @@ -4319,14 +4322,12 @@ PostgresMain(const char *dbname, const char *username) */ BeginReportingGUCOptions(); - /* Verify the shared barrier, if it's still active: join and wait. */ - WaitOnShmemBarrier(); - /* - * After waiting on the barrier above we guaranteed to have NSharedBuffers - * broadcasted, so we can use it in the function below. + * TODO: The new backend should fetch the shared buffers status. If the + * resizing is going on, it should bring itself up to speed with it. If not, + * simply fetch the latest pointers and sizes. Is this the right place to do + * that? */ - AdjustShmemSize(); /* * Also set up handler to log session end; we have to wait till now to be diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 9a6a6275305..5794d9522d7 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -155,14 +155,12 @@ REPLICATION_ORIGIN_DROP "Waiting for a replication origin to become inactive so REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it can be dropped." RESTORE_COMMAND "Waiting for to complete." SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a READ ONLY DEFERRABLE transaction." -SHMEM_RESIZE_START "Waiting for other backends to start resizing shared memory." -SHMEM_RESIZE_EVICT "Waiting for other backends to finish buffer evication phase." -SHMEM_RESIZE_DONE "Waiting for other backends to finish resizing shared memory." SYNC_REP "Waiting for confirmation from a remote server during synchronous replication." WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." 
WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end." +PM_BUFFER_RESIZE_WAIT "Waiting for the postmaster to complete shared buffer pool resize operations." ABI_compatibility: diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 90d3feb547c..894a04caf0f 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -139,8 +139,10 @@ int max_parallel_maintenance_workers = 2; * MaxBackends is computed by PostmasterMain after modules have had a chance to * register background workers. */ -int NBuffers = 16384; -int MaxAvailableMemory = 524288; +int NBuffers = 0; +int NBuffersPending = 16384; +bool finalMaxNBuffers = false; +int MaxNBuffers = 0; int MaxConnections = 100; int max_worker_processes = 8; int max_parallel_workers = 8; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 641e535a73c..e0401fb6477 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -599,6 +599,38 @@ InitializeFastPathLocks(void) pg_nextpower2_32(FastPathLockGroupsPerBackend)); } +/* + * Initialize MaxNBuffers variable with validation. + * + * This must be called after GUCs have been loaded but before shared memory size + * is determined. + * + * Since MaxNBuffers limits the size of the buffer pool, it must be at least as + * much as NBuffersPending. If MaxNBuffers is 0 (default), set it to + * NBuffersPending. Otherwise, validate that MaxNBuffers is not less than + * NBuffersPending. 
+ */ +void +InitializeMaxNBuffers(void) +{ + if (MaxNBuffers == 0) /* default/boot value */ + MaxNBuffers = NBuffersPending; + else + { + if (MaxNBuffers < NBuffersPending) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("max_shared_buffers (%d) cannot be less than current shared_buffers (%d)", + MaxNBuffers, NBuffersPending), + errhint("Increase max_shared_buffers or decrease shared_buffers."))); + } + } + + Assert(!finalMaxNBuffers); + finalMaxNBuffers = true; +} + /* * Early initialization of a backend (either standalone or under postmaster). * This happens even before InitPostgres. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8794e26ef1d..71a09a65182 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2731,7 +2731,7 @@ convert_to_base_unit(double value, const char *unit, * the value without loss. For example, if the base unit is GUC_UNIT_KB, 1024 * is converted to 1 MB, but 1025 is represented as 1025 kB. */ -static void +void convert_int_from_base_unit(int64 base_value, int base_unit, int64 *value, const char **unit) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 7b3ac5f3716..262f42c06c3 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -1108,25 +1108,23 @@ { name => 'shared_buffers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM', short_desc => 'Sets the number of shared memory buffers used by the server.', flags => 'GUC_UNIT_BLOCKS', - variable => 'NBuffers', + variable => 'NBuffersPending', boot_val => '16384', min => '16', max => 'INT_MAX / 2', - assign_hook => 'assign_shared_buffers' + check_hook => 'check_shared_buffers', + show_hook => 'show_shared_buffers', }, -# TODO: should this be PGC_POSTMASTER? 
-{ name => "max_available_memory", type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM', +{ name => "max_shared_buffers", type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', short_desc => 'Sets the upper limit for the shared_buffers value.', - long_desc => 'Shared memory could be resized at runtime, this parameters sets the upper limit for it, beyond which resizing would not be supported. Normally this value would be the same as the total available memory.', flags => 'GUC_UNIT_BLOCKS', - variable => 'MaxAvailableMemory', - boot_val => '524288', - min => '16', + variable => 'MaxNBuffers', + boot_val => '0', + min => '0', max => 'INT_MAX / 2', }, - { name => 'vacuum_buffer_usage_limit', type => 'int', context => 'PGC_USERSET', group => 'RESOURCES_MEM', short_desc => 'Sets the buffer pool size for VACUUM, ANALYZE, and autovacuum.', flags => 'GUC_UNIT_KB', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 8f1d0b7c031..ce5110b8636 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12620,4 +12620,10 @@ proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}', prosrc => 'pg_get_aios' }, +{ oid => '9999', descr => 'resize shared buffers according to the value of GUC `shared_buffers`', + proname => 'pg_resize_shared_buffers', + provolatile => 'v', + prorettype => 'bool', + proargtypes => '', + prosrc => 'pg_resize_shared_buffers'}, ] diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index a0c37a7749e..efe3d3c73ff 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -172,8 +172,11 @@ extern PGDLLIMPORT bool ExitOnAnyError; extern PGDLLIMPORT char *DataDir; extern PGDLLIMPORT int data_directory_mode; +/* TODO: This is no more a GUC variable; should be moved somewhere else. 
*/ extern PGDLLIMPORT int NBuffers; -extern PGDLLIMPORT int MaxAvailableMemory; +extern PGDLLIMPORT int NBuffersPending; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; extern PGDLLIMPORT int MaxBackends; extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; @@ -502,6 +505,7 @@ extern PGDLLIMPORT ProcessingMode Mode; extern void pg_split_opts(char **argv, int *argcp, const char *optstr); extern void InitializeMaxBackends(void); extern void InitializeFastPathLocks(void); +extern void InitializeMaxNBuffers(void); extern void InitPostgres(const char *in_dbname, Oid dboid, const char *username, Oid useroid, bits32 flags, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 20bea8132fd..bbb7a225216 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -447,7 +447,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -extern void StrategyReInitialize(int FirstBufferToInit); +extern void StrategyReset(int activeNBuffers); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 74e226269af..6866d09dc22 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -20,6 +20,7 @@ #include "storage/buf.h" #include "storage/bufpage.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" #include "utils/relcache.h" #include "utils/snapmgr.h" @@ -151,6 +152,7 @@ typedef struct WritebackContext WritebackContext; /* in globals.c ... 
this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; +extern PGDLLIMPORT int NBuffersPending; /* in bufmgr.c */ extern PGDLLIMPORT bool zero_damaged_pages; @@ -197,6 +199,11 @@ extern PGDLLIMPORT int32 *LocalRefCount; #define BUFFER_LOCK_SHARE 1 #define BUFFER_LOCK_EXCLUSIVE 2 +/* + * prototypes for functions in buf_init.c + */ +extern const char *show_shared_buffers(void); +extern bool check_shared_buffers(int *newval, void **extra, GucSource source); /* * prototypes for functions in bufmgr.c @@ -300,7 +307,7 @@ extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); extern bool BgBufferSync(WritebackContext *wb_context); -extern void BgBufferSyncReset(int NBuffersOld, int NBuffersNew); +extern void BgBufferSyncReset(int currentNBuffers, int targetNBuffers); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); @@ -317,11 +324,13 @@ extern void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped); -extern bool EvictExtraBuffers(int fromBuf, int toBuf); +extern bool EvictExtraBuffers(int targetNBuffers, int currentNBuffers); /* in buf_init.c */ -extern void BufferManagerShmemInit(int); -extern Size BufferManagerShmemSize(void); +extern void BufferManagerShmemInit(void); +extern Size BufferManagerShmemSize(bool set_reserved); +extern void BufferManagerShmemResize(int currentNBuffers, int targetNBuffers); +extern void BufferManagerShmemValidate(int targetNBuffers); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index 6e7b0abb625..10e74b34813 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -64,7 +64,6 @@ typedef void (*shmem_startup_hook_type) (void); /* ipc.c */ extern PGDLLIMPORT bool proc_exit_inprogress; extern PGDLLIMPORT bool shmem_exit_inprogress; -extern PGDLLIMPORT volatile bool pending_pm_shmem_resize; extern PGDLLIMPORT 
volatile bool delay_shmem_resize; pg_noreturn extern void proc_exit(int code); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 704b065f9e9..34b5e6c48ca 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -24,9 +24,11 @@ #ifndef PG_SHMEM_H #define PG_SHMEM_H +#include "port/atomics.h" #include "storage/barrier.h" #include "storage/dsm_impl.h" #include "storage/spin.h" +#include "utils/guc.h" typedef struct AnonymousMapping { @@ -73,14 +75,20 @@ extern PGDLLIMPORT AnonymousMapping Mappings[ANON_MAPPINGS]; /* * ShmemControl is shared between backends and helps to coordinate shared * memory resize. + * + * TODO: I think we need a lock to protect this structure. If we do so, do we + * need to use atomic integers? */ typedef struct { - pg_atomic_uint32 NSharedBuffers; - pid_t evictor_pid; - Barrier Barrier; - pg_atomic_uint64 Generation; - bool Resizable; + pg_atomic_flag resize_in_progress; /* true if resizing is in progress. false otherwise. */ + pg_atomic_uint32 currentNBuffers; /* Original NBuffers value before resize started */ + pg_atomic_uint32 targetNBuffers; + pg_atomic_uint32 activeNBuffers; /* Active portion of buffer pool during resizing. */ + pg_atomic_uint32 transitNBuffers; /* Part of the buffer pool beyond activeNBuffers which may remain accessible during resizing. */ + pid_t coordinator; + ConditionVariable pm_cv; /* Coordinator waits for PM to complete its work using this CV. */ + bool pmwork_done; /* PM has completed its work of resizing buffers. 
*/ } ShmemControl; extern PGDLLIMPORT ShmemControl *ShmemCtrl; @@ -95,7 +103,8 @@ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; extern PGDLLIMPORT int huge_pages_status; -extern PGDLLIMPORT int MaxAvailableMemory; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; /* Possible values for huge_pages and huge_pages_status */ typedef enum @@ -145,7 +154,8 @@ extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, void PrepareHugePages(void); bool ProcessBarrierShmemResize(Barrier *barrier); -void assign_shared_buffers(int newval, void *extra, bool *pending); +const char *show_shared_buffers(void); +bool check_shared_buffers(int *newval, void **extra, GucSource source); void AdjustShmemSize(void); extern void WaitOnShmemBarrier(void); extern void ShmemControlInit(void); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 97033f84dce..b80b05f2804 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,7 +54,10 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ - PROCSIGNAL_BARRIER_SHMEM_RESIZE, /* ask backends to resize shared memory */ + PROCSIGNAL_BARRIER_SHBUF_SHRINK, /* shrink buffer pool - restrict allocations to new size */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, /* remap shared memory segments and update structure pointers */ + PROCSIGNAL_BARRIER_SHBUF_EXPAND, /* expand buffer pool - enable allocations in new range */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, /* signal backends that the shared buffer resizing failed. 
*/ } ProcSignalBarrierType; /* diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 64ff5a286ba..6944560d485 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -50,11 +50,19 @@ extern HTAB *ShmemInitHashInSegment(const char *name, long init_size, extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); extern void *ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, int shmem_segment); +extern void *ShmemUpdateStructInSegment(const char *name, Size size, + bool *foundPtr, int shmem_segment); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); extern PGDLLIMPORT Size pg_get_shmem_pagesize(void); +extern bool ProcessBarrierShmemShrink(void); +extern bool ProcessBarrierShmemResizeMapAndMem(void); +extern bool ProcessBarrierShmemExpand(void); +extern bool ProcessBarrierShmemResizeFailed(void); + + /* ipci.c */ extern void RequestAddinShmemSpace(Size size); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index f21ec37da89..08a84373fb7 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -459,6 +459,8 @@ extern config_handle *get_config_handle(const char *name); extern void AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt); extern char *GetConfigOptionByName(const char *name, const char **varname, bool missing_ok); +extern void convert_int_from_base_unit(int64 base_value, int base_unit, + int64 *value, const char **unit); extern void TransformGUCArray(ArrayType *array, List **names, List **values); diff --git a/src/test/buffermgr/Makefile b/src/test/buffermgr/Makefile index 97c3da9e20a..eb275027fa6 100644 --- a/src/test/buffermgr/Makefile +++ b/src/test/buffermgr/Makefile @@ -13,6 +13,9 @@ EXTRA_INSTALL = contrib/pg_buffercache REGRESS = buffer_resize +# Custom configuration for buffer manager tests +TEMP_CONFIG = $(srcdir)/buffermgr_test.conf + subdir = src/test/buffermgr top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global diff --git a/src/test/buffermgr/buffermgr_test.conf b/src/test/buffermgr/buffermgr_test.conf new file mode 100644 index 00000000000..21ccf66d9c7 --- /dev/null +++ b/src/test/buffermgr/buffermgr_test.conf @@ -0,0 +1,9 @@ +# Configuration for buffer manager regression tests + +# Even if max_shared_buffers is set multiple times, only the last one is used +# as the limit on shared_buffers. +max_shared_buffers = 128kB +# Set initial shared_buffers as expected by test +shared_buffers = 128MB +# Set a larger value for max_shared_buffers to allow testing resize operations +max_shared_buffers = 300MB \ No newline at end of file diff --git a/src/test/buffermgr/expected/buffer_resize.out b/src/test/buffermgr/expected/buffer_resize.out index a986be9a5da..d5cb9d78437 100644 --- a/src/test/buffermgr/expected/buffer_resize.out +++ b/src/test/buffermgr/expected/buffer_resize.out @@ -1,9 +1,8 @@ -- Test buffer pool resizing and shared memory allocation tracking -- This test resizes the buffer pool multiple times and monitors -- shared memory allocations related to buffer management --- Create a separate schema for this test -CREATE SCHEMA buffer_resize_test; -SET search_path TO buffer_resize_test, public; +-- TODO: The test sets shared_buffers values in MBs. Instead it could use values +-- in kBs so that the test runs on very small machines. 
-- Create a view for buffer-related shared memory allocations CREATE VIEW buffer_allocations AS SELECT name, segment, size, allocated_size @@ -28,6 +27,49 @@ SHOW shared_buffers; 128MB (1 row) +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. 
+SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + SELECT * FROM buffer_allocations; name | segment | size | allocated_size -------------------------------+-------------+-----------+---------------- @@ -40,10 +82,10 @@ SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; name | size | mapping_size | mapping_reserved_size -------------+-----------+--------------+----------------------- - buffers | 134225920 | 134225920 | 2576982016 - checkpoint | 335872 | 335872 | 214753280 - descriptors | 1056768 | 1056768 | 429498368 - iocv | 270336 | 270336 | 429498368 + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 (4 rows) SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -60,10 +102,18 @@ SELECT pg_reload_conf(); t (1 row) -SELECT pg_sleep(1); - pg_sleep ----------- - +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 128MB (pending: 64MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t (1 row) SHOW shared_buffers; @@ -84,10 +134,10 @@ SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; name | size | mapping_size | mapping_reserved_size -------------+----------+--------------+----------------------- - buffers | 67117056 | 67117056 | 2576982016 - checkpoint | 172032 | 172032 | 214753280 - descriptors | 532480 | 532480 | 429498368 - iocv | 139264 | 139264 | 429498368 + buffers | 67117056 | 67117056 | 314580992 + checkpoint | 172032 | 172032 | 770048 + descriptors | 532480 | 532480 | 2465792 + iocv | 139264 | 139264 | 622592 (4 rows) SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -104,10 +154,18 @@ SELECT pg_reload_conf(); t (1 row) -SELECT pg_sleep(1); - pg_sleep ----------- - 
+-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 64MB (pending: 256MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t (1 row) SHOW shared_buffers; @@ -128,10 +186,10 @@ SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; name | size | mapping_size | mapping_reserved_size -------------+-----------+--------------+----------------------- - buffers | 268443648 | 268443648 | 2576982016 - checkpoint | 663552 | 663552 | 214753280 - descriptors | 2105344 | 2105344 | 429498368 - iocv | 532480 | 532480 | 429498368 + buffers | 268443648 | 268443648 | 314580992 + checkpoint | 663552 | 663552 | 770048 + descriptors | 2105344 | 2105344 | 2465792 + iocv | 532480 | 532480 | 622592 (4 rows) SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -148,10 +206,18 @@ SELECT pg_reload_conf(); t (1 row) -SELECT pg_sleep(1); - pg_sleep ----------- - +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +------------------------ + 256MB (pending: 100MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t (1 row) SHOW shared_buffers; @@ -172,10 +238,10 @@ SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; name | size | mapping_size | mapping_reserved_size -------------+-----------+--------------+----------------------- - buffers | 104865792 | 104865792 | 2576982016 - checkpoint | 262144 | 262144 | 214753280 - descriptors | 827392 | 827392 | 429498368 - iocv | 212992 | 212992 | 429498368 + buffers | 104865792 | 104865792 | 314580992 + checkpoint | 262144 | 262144 | 770048 + descriptors | 827392 | 827392 | 2465792 + iocv | 212992 | 212992 | 622592 (4 rows) SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -192,10 +258,18 @@ SELECT pg_reload_conf(); t (1 row) -SELECT pg_sleep(1); - pg_sleep ----------- - +-- reconnect to ensure new setting is loaded 
+\c +SHOW shared_buffers; + shared_buffers +------------------------ + 100MB (pending: 128kB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t (1 row) SHOW shared_buffers; @@ -216,10 +290,10 @@ SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; name | size | mapping_size | mapping_reserved_size -------------+--------+--------------+----------------------- - buffers | 139264 | 139264 | 2576982016 - checkpoint | 8192 | 8192 | 214753280 - descriptors | 8192 | 8192 | 429498368 - iocv | 8192 | 8192 | 429498368 + buffers | 139264 | 139264 | 314580992 + checkpoint | 8192 | 8192 | 770048 + descriptors | 8192 | 8192 | 2465792 + iocv | 8192 | 8192 | 622592 (4 rows) SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -228,10 +302,28 @@ SELECT COUNT(*) AS buffer_count FROM pg_buffercache; 16 (1 row) --- Clean up the schema and all its objects -RESET search_path; -DROP SCHEMA buffer_resize_test CASCADE; -NOTICE: drop cascades to 3 other objects -DETAIL: drop cascades to view buffer_resize_test.buffer_allocations -drop cascades to view buffer_resize_test.buffer_segments -drop cascades to extension pg_buffercache +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +ERROR: invalid value for parameter "shared_buffers": 51200 +DETAIL: "shared_buffers" must be less than "max_shared_buffers". 
+SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build index e71dcdea685..561630e846f 100644 --- a/src/test/buffermgr/meson.build +++ b/src/test/buffermgr/meson.build @@ -8,10 +8,15 @@ tests += { 'sql': [ 'buffer_resize', ], + 'regress_args': ['--temp-config', files('buffermgr_test.conf')], }, 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, 'tests': [ 't/001_resize_buffer.pl', + 't/003_parallel_resize_buffer.pl', ], }, } diff --git a/src/test/buffermgr/sql/buffer_resize.sql b/src/test/buffermgr/sql/buffer_resize.sql index 45f5bb6d78b..dfaaeabfcbb 100644 --- a/src/test/buffermgr/sql/buffer_resize.sql +++ b/src/test/buffermgr/sql/buffer_resize.sql @@ -1,10 +1,8 @@ -- Test buffer pool resizing and shared memory allocation tracking -- This test resizes the buffer pool multiple times and monitors -- shared memory allocations related to buffer management - --- Create a separate schema for this test -CREATE SCHEMA buffer_resize_test; -SET search_path TO buffer_resize_test, public; +-- TODO: The test sets shared_buffers values in MBs. Instead it could use values +-- in kBs so that the test runs on very small machines. 
-- Create a view for buffer-related shared memory allocations CREATE VIEW buffer_allocations AS @@ -28,6 +26,13 @@ CREATE EXTENSION IF NOT EXISTS pg_buffercache; -- Test 1: Default shared_buffers SHOW shared_buffers; +SHOW max_shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; SELECT COUNT(*) AS buffer_count FROM pg_buffercache; @@ -35,7 +40,10 @@ SELECT COUNT(*) AS buffer_count FROM pg_buffercache; -- Test 2: Set to 64MB ALTER SYSTEM SET shared_buffers = '64MB'; SELECT pg_reload_conf(); -SELECT pg_sleep(1); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); SHOW shared_buffers; SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; @@ -44,7 +52,10 @@ SELECT COUNT(*) AS buffer_count FROM pg_buffercache; -- Test 3: Set to 256MB ALTER SYSTEM SET shared_buffers = '256MB'; SELECT pg_reload_conf(); -SELECT pg_sleep(1); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); SHOW shared_buffers; SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; @@ -53,7 +64,10 @@ SELECT COUNT(*) AS buffer_count FROM pg_buffercache; -- Test 4: Set to 100MB (non-power-of-two) ALTER SYSTEM SET shared_buffers = '100MB'; SELECT pg_reload_conf(); -SELECT pg_sleep(1); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); SHOW shared_buffers; SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; @@ -62,12 +76,20 @@ SELECT COUNT(*) AS buffer_count FROM pg_buffercache; -- Test 5: Set to minimum 128kB ALTER SYSTEM SET shared_buffers = '128kB'; SELECT pg_reload_conf(); -SELECT pg_sleep(1); +-- reconnect to 
ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); SHOW shared_buffers; SELECT * FROM buffer_allocations; SELECT * FROM buffer_segments; SELECT COUNT(*) AS buffer_count FROM pg_buffercache; --- Clean up the schema and all its objects -RESET search_path; -DROP SCHEMA buffer_resize_test CASCADE; +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; +SHOW max_shared_buffers; diff --git a/src/test/buffermgr/t/001_resize_buffer.pl b/src/test/buffermgr/t/001_resize_buffer.pl index 8cf9e4539ab..7b0a78ebe5b 100644 --- a/src/test/buffermgr/t/001_resize_buffer.pl +++ b/src/test/buffermgr/t/001_resize_buffer.pl @@ -14,40 +14,26 @@ sub apply_and_verify_buffer_change { my ($node, $new_size) = @_; - # Use a single background_psql session for consistency - my $psql_session = $node->background_psql('postgres'); - $psql_session->query_safe("ALTER SYSTEM SET shared_buffers = '$new_size'"); - $psql_session->query_safe("SELECT pg_reload_conf()"); - - # Wait till the resizing finishes using the same session - # - # TODO: Right now there is no way to know when the resize has finished and - # all the backends are using new value of shared_buffers. Hence we poll - # manually until we get the expected value in the same session. 
- my $current_size; - my $attempts = 0; - my $max_attempts = 60; # 60 seconds timeout - do { - $current_size = $psql_session->query_safe("SHOW shared_buffers"); - $attempts++; - - # Only sleep if we didn't get the expected result and haven't timed out yet - if ($current_size ne $new_size && $attempts < $max_attempts) { - sleep(1); - } - } while ($current_size ne $new_size && $attempts < $max_attempts); - - $psql_session->quit; - - # Check if we succeeded or timed out - if ($current_size ne $new_size) { - die "Timeout waiting for shared_buffers to change to $new_size (got $current_size after ${attempts}s)"; - } + # Use the new pg_resize_shared_buffers() interface which handles everything synchronously + $node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '$new_size'"); + $node->safe_psql('postgres', "SELECT pg_reload_conf()"); + # Call the resize function - it returns when the operation is complete + is($node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()"), 't', + 'resizing to ' . $new_size . ' succeeded'); + is($node->safe_psql('postgres', "SHOW shared_buffers"), $new_size, + 'SHOW after resizing to '. $new_size . ' succeeded'); } # Initialize a cluster and start pgbench in the background for concurrent load. my $node = PostgreSQL::Test::Cluster->new('main'); $node->init; + +# Permit resizing up to 1GB for this test and let the server start with 128MB. 
+$node->append_conf('postgresql.conf', qq{
+max_shared_buffers = 1GB
+shared_buffers = 128MB
+});
+
 $node->start;
 $node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache");
 my $pgb_scale = 10;
diff --git a/src/test/buffermgr/t/003_parallel_resize_buffer.pl b/src/test/buffermgr/t/003_parallel_resize_buffer.pl
new file mode 100644
index 00000000000..9cbb5452fd2
--- /dev/null
+++ b/src/test/buffermgr/t/003_parallel_resize_buffer.pl
@@ -0,0 +1,71 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Test that only one pg_resize_shared_buffers() call succeeds when multiple
+# sessions attempt to resize buffers concurrently
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Skip this test if injection points are not supported
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+# Initialize a cluster
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points');
+$node->append_conf('postgresql.conf', 'shared_buffers = 128kB');
+$node->append_conf('postgresql.conf', 'max_shared_buffers = 256kB');
+$node->start;
+
+# Load injection points extension for test coordination
+$node->safe_psql('postgres', "CREATE EXTENSION injection_points");
+
+# Test 1: Two concurrent pg_resize_shared_buffers() calls
+# Set up injection point to pause the first resize call
+$node->safe_psql('postgres',
+	"SELECT injection_points_attach('pg-resize-shared-buffers-flag-set', 'wait')");
+
+# Change shared_buffers for the resize operation
+$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '144kB'");
+$node->safe_psql('postgres', "SELECT pg_reload_conf()");
+
+# Start first resize session (will pause at injection point)
+my $session1 = $node->background_psql('postgres');
+$session1->query_until(
+	qr/starting_resize/,
+	q(
+		\echo starting_resize
+		SELECT pg_resize_shared_buffers();
+	)
+);
+
+# Wait until session actually reaches the injection point
+$node->wait_for_event('client backend', 'pg-resize-shared-buffers-flag-set');
+
+# Start second resize session (should fail immediately since resize is in progress)
+my $result2 = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()");
+
+# The second call should return false (already in progress)
+is($result2, 'f', 'Second concurrent resize call returns false');
+
+# Wake up the first session
+$node->safe_psql('postgres',
+	"SELECT injection_points_wakeup('pg-resize-shared-buffers-flag-set')");
+
+# The pg_resize_shared_buffers() in session1 should now complete successfully
+# We can't easily capture the return value from query_until, but we can
+# verify the session completes without error and the resize actually happened
+$session1->quit;
+
+# Detach injection point
+$node->safe_psql('postgres',
+	"SELECT injection_points_detach('pg-resize-shared-buffers-flag-set')");
+
+done_testing();
-- 
2.34.1