From 0d1f0f67557679cb20d53096d12fa22de7760fe5 Mon Sep 17 00:00:00 2001 From: Maxim Orlov Date: Fri, 28 Feb 2025 17:06:01 +0300 Subject: [PATCH v4] Process sync requests incrementally in AbsorbSyncRequests If the number of sync requests is big enough, the palloc() call in AbsorbSyncRequests() will attempt to allocate more than 1 Gb of memory, resulting in failure. This will lead to an infinite loop in the checkpointer process. So, we process requests incrementally with a buffer of not more than 1 Gb on each step. --- src/backend/postmaster/checkpointer.c | 63 +++++++++++++++++---------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 7acbbd3e26..de2e99844b 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -1324,39 +1324,56 @@ AbsorbSyncRequests(void) CheckpointerRequest *requests = NULL; CheckpointerRequest *request; int n; + bool loop; if (!AmCheckpointerProcess()) return; - LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - - /* - * We try to avoid holding the lock for a long time by copying the request - * array, and processing the requests after releasing the lock. - * - * Once we have cleared the requests from shared memory, we have to PANIC - * if we then fail to absorb them (eg, because our hashtable runs out of - * memory). This is because the system cannot run safely if we are unable - * to fsync what we have been told to fsync. Fortunately, the hashtable - * is so small that the problem is quite unlikely to arise in practice. - */ - n = CheckpointerShmem->num_requests; - if (n > 0) + do { - requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); - memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - } + LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); - START_CRIT_SECTION(); + /* + * We try to avoid holding the lock for a long time by copying the + * request array, and processing the requests after releasing the lock. + * + * Once we have cleared the requests from shared memory, we have to + * PANIC if we then fail to absorb them (eg, because our hashtable runs + * out of memory). This is because the system cannot run safely if we + * are unable to fsync what we have been told to fsync. Fortunately, + * the hashtable is so small that the problem is quite unlikely to arise + * in practice. + * + * Note: we could not palloc more than 1Gb of memory, thus make sure + * that the maximum number of elements will fit in the requests buffer. + */ + n = Min(CheckpointerShmem->num_requests, + MaxAllocSize / sizeof(CheckpointerRequest)); + if (n > 0) + { + if (!requests) + requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); + memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest)); - CheckpointerShmem->num_requests = 0; + CheckpointerShmem->num_requests -= n; - LWLockRelease(CheckpointerCommLock); + memmove(CheckpointerShmem->requests, + CheckpointerShmem->requests + n, + CheckpointerShmem->num_requests * sizeof(CheckpointerRequest)); + } + + START_CRIT_SECTION(); + + /* Are there any requests in the queue? If so, keep going. */ + loop = CheckpointerShmem->num_requests != 0; + + LWLockRelease(CheckpointerCommLock); - for (request = requests; n > 0; request++, n--) - RememberSyncRequest(&request->ftag, request->type); + for (request = requests; n > 0; request++, n--) + RememberSyncRequest(&request->ftag, request->type); - END_CRIT_SECTION(); + END_CRIT_SECTION(); + } while (loop); if (requests) pfree(requests); -- 2.48.1