From 0ffe042c70e0b665b6ae673b39713799618b9f02 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 19 Oct 2023 15:12:52 -0400
Subject: [PATCH 1/2] RelationTruncate must also set DELAY_CHKPT_START

Previously, it set only DELAY_CHKPT_COMPLETE. That was important,
because it meant that if the XLOG_SMGR_TRUNCATE record preceded a
XLOG_CHECKPOINT_ONLINE record in the WAL, then the truncation would
also happen on disk before the XLOG_CHECKPOINT_ONLINE record was
written.

However, it didn't guarantee that the sync request for the
truncation got processed before the XLOG_CHECKPOINT_ONLINE record
was written. By setting XLOG_CHKPT_START, we guarantee that if
an XLOG_SMGR_TRUNCATE record is written to WAL before the redo
pointer of some concurrent checkpoint, the sync request queued by
that operation must be processed by that checkpoint (rather than
being left for the following one).

Report by Thomas Munro. Patch by me.
---
 src/backend/catalog/storage.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 93f07e49b7..bef4269368 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -336,20 +336,33 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	RelationPreTruncate(rel);
 
 	/*
-	 * Make sure that a concurrent checkpoint can't complete while truncation
-	 * is in progress.
+	 * The code which follows can interact with concurrent checkpoints in two
+	 * separate ways.
 	 *
-	 * The truncation operation might drop buffers that the checkpoint
+	 * First, the truncation operation might drop buffers that the checkpoint
 	 * otherwise would have flushed. If it does, then it's essential that the
 	 * files actually get truncated on disk before the checkpoint record is
 	 * written. Otherwise, if reply begins from that checkpoint, the
 	 * to-be-truncated blocks might still exist on disk but have older
 	 * contents than expected, which can cause replay to fail. It's OK for the
 	 * blocks to not exist on disk at all, but not for them to have the wrong
-	 * contents.
+	 * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
+	 * this code executes.
+	 *
+	 * Second, the call to smgrtruncate() below will in turn call
+	 * RegisterSyncRequest(). We need the sync request created by that call
+	 * to be processed before the checkpoint completes. CheckPointGuts()
+	 * will call ProcessSyncRequests(), but if we register our sync request
+	 * after that happens, then the WAL record for the truncation could end
+	 * up preceding the checkpoint record, while the actual sync doesn't happen
+	 * until the next checkpoint. To prevent that, we need to set
+	 * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
+	 * the redo pointer of a concurrent checkpoint, we're guaranteed that the
+	 * corresponding sync request will be processed before the checkpoint
+	 * completes.
 	 */
-	Assert((MyProc->delayChkptFlags & DELAY_CHKPT_COMPLETE) == 0);
-	MyProc->delayChkptFlags |= DELAY_CHKPT_COMPLETE;
+	Assert((MyProc->delayChkptFlags & (DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE)) == 0);
+	MyProc->delayChkptFlags |= DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE;
 
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
@@ -397,7 +410,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
 
 	/* We've done all the critical work, so checkpoints are OK now. */
-	MyProc->delayChkptFlags &= ~DELAY_CHKPT_COMPLETE;
+	MyProc->delayChkptFlags &= ~(DELAY_CHKPT_START | DELAY_CHKPT_COMPLETE);
 
 	/*
 	 * Update upper-level FSM pages to account for the truncation. This is
-- 
2.37.1 (Apple Git-137.1)