From b01ee0c2408669ced7154be9f0de71e8771a6a8c Mon Sep 17 00:00:00 2001 From: Anthonin Bonnefoy Date: Tue, 16 Dec 2025 10:48:12 +0100 Subject: Fix 'unexpected data beyond EOF' on replica restart On restart, a replica can fail with an 'unexpected data beyond EOF in block 200 of relation T/D/R' error. This can happen under the following circumstances: - A relation has a size of 400 blocks. - Blocks 201 to 400 are empty. - Block 200 has two rows. - Blocks 100 to 199 are empty. - A restartpoint is done. - Vacuum truncates the relation to 200 blocks. - An FPW deletes a row in block 200. - A checkpoint is done. - An FPW deletes the last row in block 200. - Vacuum truncates the relation to 100 blocks. - The replica restarts. When the replica restarts: - The relation on disk is already reduced to 100 blocks, because the second truncate was applied before the restart. - The first truncate to 200 blocks is replayed. It silently fails, but it still updates the cached size to 200 blocks. - The first FPW on block 200 is applied. XLogReadBufferForRedo relies on the cached size, incorrectly assumes the page exists in the file, and thus doesn't extend the relation. - The online checkpoint record is replayed, calling smgrdestroyall, which discards the cached size. - The second FPW on block 200 is applied. This time, the detected size is 100 blocks, so an extension is attempted. However, block 200 is already present in the buffer table due to the first FPW. This triggers the 'unexpected data beyond EOF' error, since the page isn't new. This patch fixes the issue by moving the smgr_cached_nblocks updates into mdtruncate. If the truncate size is greater than the old size, we set the cached size to the old size. Otherwise, on a successful truncate, the cached size is set to the truncate size. 
--- src/backend/storage/smgr/md.c | 26 +++++++++++++++++++++++++- src/backend/storage/smgr/smgr.c | 12 ------------ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ccb0faceb5..d0d116f42ef 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1280,18 +1280,33 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber priorblocks; int curopensegs; + /* Make the cached size is invalid if we encounter an error. */ + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + if (nblocks > curnblk) { - /* Bogus request ... but no complaint if InRecovery */ + /* + * This can happen when a relation was truncated multiple times and + * the restartpoint is located before the truncates. On restart, the + * relation on disk will have the size of the second truncate. As the + * first truncate has a higher nblocks, mdtruncate will be called with + * nblocks > curnblk during startup. + */ if (InRecovery) + { + reln->smgr_cached_nblocks[forknum] = curnblk; return; + } ereport(ERROR, (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now", relpath(reln->smgr_rlocator, forknum).str, nblocks, curnblk))); } if (nblocks == curnblk) + { + reln->smgr_cached_nblocks[forknum] = curnblk; return; /* no work */ + } /* * Truncate segments, starting at the last one. Starting at the end makes @@ -1357,6 +1372,15 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, } curopensegs--; } + + /* + * We might as well update the local smgr_cached_nblocks values. The smgr + * cache inval message that this function sent will cause other backends + * to invalidate their copies of smgr_cached_nblocks, and these ones too + * at the next command boundary. But ensure they aren't outright wrong + * until then. 
+ */ + reln->smgr_cached_nblocks[forknum] = nblocks; } /* diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bce37a36d51..b017266316e 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -898,20 +898,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, /* Do the truncation */ for (i = 0; i < nforks; i++) { - /* Make the cached size is invalid if we encounter an error. */ - reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], old_nblocks[i], nblocks[i]); - - /* - * We might as well update the local smgr_cached_nblocks values. The - * smgr cache inval message that this function sent will cause other - * backends to invalidate their copies of smgr_cached_nblocks, and - * these ones too at the next command boundary. But ensure they aren't - * outright wrong until then. - */ - reln->smgr_cached_nblocks[forknum[i]] = nblocks[i]; } } -- 2.51.0