From 6bb06cf444fd587ac30b59f725491102ccdc8ec6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 19 Mar 2021 13:06:35 +1300
Subject: [PATCH v8] Provide recovery_init_sync_method=wal.

It's important to avoid relying on dirty data in the kernel's cache when
deciding that a change has already been applied.  The existing options
achieve that but potentially do a lot of unnecesssary extra work.

This new option syncs only the WAL files, and then relies on analysis of
the WAL during replay.  We start by assuming that modifications from
before the last checkpoint were already flushed by the checkpoint.  That
is true if you crashed or ran pg_basebackup to the present location, but
false if you made a second copy with non-syncing tools.  Then, if
full_page_writes=on, we expect all data modified since the last
checkpoint to be written to disk as part of the end-of-recovery
checkpoint.  If full_page_writes=off, things are slightly trickier: we
skip updating pages that have an LSN higher than a WAL record
("BLK_DONE"), but we don't trust any data we read from the kernel's
cache to be really on disk, so we skip replay but we still make a note
to sync the file at the next checkpoint anyway.

(A slightly more paranoid mode would mark skipped pages dirty instead,
to trigger a rewrite of all pages we skip, if our threat model includes
operating systems that might mark their own page cache pages "clean"
rather than invalidating them after a writeback failure.  That is not
done in this commit.)

Discussion: https://postgr.es/m/11bc2bb7-ecb5-3ad0-b39f-df632734cd81%40discourse.org
Discussion: https://postgr.es/m/CAEET0ZHGnbXmi8yF3ywsDZvb3m9CbdsGZgfTXscQ6agcbzcZAw%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 12 +++++
 src/backend/access/transam/xlog.c             | 52 +++++++++++++------
 src/backend/access/transam/xlogutils.c        | 11 ++++
 src/backend/storage/buffer/bufmgr.c           | 20 +++++++
 src/backend/storage/file/fd.c                 | 11 ++--
 src/backend/storage/smgr/md.c                 | 15 ++++++
 src/backend/storage/smgr/smgr.c               | 15 ++++++
 src/backend/utils/misc/guc.c                  |  1 +
 src/backend/utils/misc/postgresql.conf.sample |  2 +-
 src/include/storage/bufmgr.h                  |  1 +
 src/include/storage/fd.h                      |  1 +
 src/include/storage/md.h                      |  1 +
 src/include/storage/smgr.h                    |  2 +
 13 files changed, 124 insertions(+), 20 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ee4925d6d9..fc453b3915 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9753,6 +9753,18 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
         <productname>PostgreSQL</productname>, and relevant error messages may
         appear only in kernel logs.
        </para>
+       <para>
+        When set to <literal>wal</literal>,
+        <productname>PostgreSQL</productname> will synchronize all WAL files
+        before recovery begins, and then rely on WAL replay to synchronize all
+        data files modified since the last checkpoint.  Data files that are not
+        modified by WAL replay are not synchronized, which is sufficient if
+        recovering from a crash or a copy made with
+        <application>pg_basebackup</application> (without
+        <literal>--no-sync</literal>), but may be insufficient if
+        the data directory was copied from its original location without taking
+        extra measures to ensure that it has reached the disk.
+       </para>
       </listitem>
      </varlistentry>
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6f8810e149..89678e6470 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -927,7 +927,7 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
-static void RemoveTempXlogFiles(void);
+static void ScanXlogFilesAfterCrash(void);
 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
 static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
 						   XLogSegNo *endlogSegNo);
@@ -4030,13 +4030,15 @@ UpdateLastRemovedPtr(char *filename)
 }
 
 /*
- * Remove all temporary log files in pg_wal
+ * Remove all temporary log files in pg_wal, and make sure that all remaining
+ * files are down on disk before we replay anything in them if
+ * recovery_init_sync_method requires that.
  *
  * This is called at the beginning of recovery after a previous crash,
  * at a point where no other processes write fresh WAL data.
  */
 static void
-RemoveTempXlogFiles(void)
+ScanXlogFilesAfterCrash(void)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -4048,12 +4050,23 @@ RemoveTempXlogFiles(void)
 	{
 		char		path[MAXPGPATH];
 
-		if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
-			continue;
-
 		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
-		unlink(path);
-		elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+
+		if (strncmp(xlde->d_name, "xlogtemp.", 9) == 0)
+		{
+			unlink(path);
+			elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
+		}
+		else if (IsXLogFileName(xlde->d_name) &&
+				 recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
+		{
+			/*
+			 * Make sure that whatever we read from this WAL file is durably on
+			 * disk before replaying in RECOVERY_INIT_SYNC_METHOD_WAL mode.
+			 * Necessary because SyncDataDirectory() won't do that for us.
+			 */
+			fsync_fname(path, false);
+		}
 	}
 	FreeDir(xldir);
 }
@@ -6547,23 +6560,30 @@ StartupXLOG(void)
 	ValidateXLOGDirectoryStructure();
 
 	/*----------
-	 * If we previously crashed, perform a couple of actions:
+	 * If we previously crashed:
 	 *
 	 * - The pg_wal directory may still include some temporary WAL segments
 	 *   used when creating a new segment, so perform some clean up to not
-	 *   bloat this path.  This is done first as there is no point to sync
-	 *   this temporary data.
+	 *   bloat this path, in ScanXlogFilesAfterCrash().
+	 *
+	 * - It's possible that some WAL data had been written but not yet synced.
+	 *   Therefore we have to sync these files before replaying the records
+	 *   they contain.  If recovery_init_wal_sync_method=wal we'll do that
+	 *   in ScanXlogFilesAfterCrash(); otherwise we'll leave it up to
+	 *   SyncDataDirectory().
 	 *
 	 * - There might be data which we had written, intending to fsync it, but
-	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
-	 *   the near future might cause earlier unflushed writes to be lost, even
-	 *   though more recent data written to disk from here on would be
-	 *   persisted.  To avoid that, fsync the entire data directory.
+	 *   which we had not actually fsync'd yet.  If
+	 *   recovery_init_wal_sync_method=wal, then XLogReadBufferForRedo() will
+	 *   compute the set of files that may need fsyncing at the next
+	 *   checkpoint, during recovery.  Otherwise, SyncDataDirectory() will
+	 *   sync the entire pgdata directory up front, to avoid relying on data
+	 *   from the kernel's cache that hasn't reached disk yet.
 	 */
 	if (ControlFile->state != DB_SHUTDOWNED &&
 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
 	{
-		RemoveTempXlogFiles();
+		ScanXlogFilesAfterCrash();
 		SyncDataDirectory();
 	}
 
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index d17d660f46..85303e761e 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -401,7 +401,18 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 					LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
 			}
 			if (lsn <= PageGetLSN(BufferGetPage(*buf)))
+			{
+				/*
+				 * The page is from the future.  We must be in crash recovery.
+				 * We don't need to redo the record, but we do need to make
+				 * sure that the image as we have seen it is durably stored on
+				 * disk as part of the next checkpoint, unless that was already
+				 * done by SyncDataDirectory().
+				 */
+				if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
+					RequestBufferSync(*buf);
 				return BLK_DONE;
+			}
 			else
 				return BLK_NEEDS_REDO;
 		}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 852138f9c9..6393545f0c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1530,6 +1530,26 @@ MarkBufferDirty(Buffer buffer)
 	}
 }
 
+/*
+ * Request that the file containing a buffer be fsynced at the next checkpoint.
+ * This is only used in crash recovery, to make it safe to skip applying WAL on
+ * the basis that the page already has a change.  We want to make sure that the
+ * data we read from the kernel matches what's on disk at the next checkpoint.
+ */
+void
+RequestBufferSync(Buffer buffer)
+{
+	SMgrRelation reln;
+	BufferDesc *bufHdr;
+
+	Assert(BufferIsPinned(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	bufHdr = GetBufferDescriptor(buffer - 1);
+	reln = smgropen(bufHdr->tag.rnode, InvalidBackendId);
+	smgrsync(reln, bufHdr->tag.forkNum, bufHdr->tag.blockNum);
+}
+
 /*
  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
  *
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 28933f8bbe..dde20fb5c3 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -336,6 +336,7 @@ static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
 					bool process_symlinks,
 					int elevel);
+
 #ifdef PG_FLUSH_DATA_WORKS
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
@@ -3293,8 +3294,9 @@ do_syncfs(const char *path)
 #endif
 
 /*
- * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
- * all potential filesystem, depending on recovery_init_sync_method setting.
+ * Issue fsync recursively on PGDATA and all its contents, issue syncfs for all
+ * potential filesystem, or do nothing, depending on the
+ * recovery_init_sync_method setting.
  *
  * We fsync regular files and directories wherever they are, but we
  * follow symlinks only for pg_wal and immediately under pg_tblspc.
@@ -3323,6 +3325,10 @@ SyncDataDirectory(void)
 	if (!enableFsync)
 		return;
 
+	/* Likewise if we're using WAL analysis instead. */
+	if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_WAL)
+		return;
+
 	/*
 	 * If pg_wal is a symlink, we'll need to recurse into it separately,
 	 * because the first walkdir below will ignore it.
@@ -3478,7 +3484,6 @@ walkdir(const char *path,
 		(*action) (path, true, elevel);
 }
 
-
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 1e12cfad8e..d74506be31 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -961,6 +961,21 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	}
 }
 
+/*
+ *	mdsync() -- Schedule a sync at the next checkpoint.
+ *
+ * This is useful in crash recovery, to ensure that data we've read from the
+ * kernel matches what is on disk before a checkpoint.
+ */
+void
+mdsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	MdfdVec	   *seg;
+
+	seg = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
+	register_dirty_segment(reln, forknum, seg);
+}
+
 /*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4dc24649df..6856c40300 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -62,6 +62,8 @@ typedef struct f_smgr
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+	void		(*smgr_sync) (SMgrRelation reln, ForkNumber forknum,
+							  BlockNumber blocknum);
 } f_smgr;
 
 static const f_smgr smgrsw[] = {
@@ -79,6 +81,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_read = mdread,
 		.smgr_write = mdwrite,
 		.smgr_writeback = mdwriteback,
+		.smgr_sync = mdsync,
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
 		.smgr_immedsync = mdimmedsync,
@@ -540,6 +543,18 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 											nblocks);
 }
 
+
+/*
+ *	smgrsync() -- Request that the file containing a block be flushed at the
+ *				  next checkpoint.
+ */
+void
+smgrsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	smgrsw[reln->smgr_which].smgr_sync(reln, forknum, blocknum);
+}
+
+
 /*
  *	smgrnblocks() -- Calculate the number of blocks in the
  *					 supplied relation.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2964efda96..2b5e1151c7 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -493,6 +493,7 @@ static struct config_enum_entry recovery_init_sync_method_options[] = {
 #ifdef HAVE_SYNCFS
 	{"syncfs", RECOVERY_INIT_SYNC_METHOD_SYNCFS, false},
 #endif
+	{"wal", RECOVERY_INIT_SYNC_METHOD_WAL, false},
 	{NULL, 0, false}
 };
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 86425965d0..b9091ccb75 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -761,7 +761,7 @@
 #restart_after_crash = on		# reinitialize after backend crash?
 #remove_temp_files_after_crash = on	# remove temporary files after
 					# backend crash?
-#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+)
+#recovery_init_sync_method = fsync	# fsync, syncfs (Linux 5.8+), wal
 #data_sync_retry = off			# retry or panic on failure to fsync
 					# data?
 					# (change requires restart)
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index fb00fda6a7..f5409264da 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -186,6 +186,7 @@ extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode,
 extern void ReleaseBuffer(Buffer buffer);
 extern void UnlockReleaseBuffer(Buffer buffer);
 extern void MarkBufferDirty(Buffer buffer);
+extern void RequestBufferSync(Buffer buffer);
 extern void IncrBufferRefCount(Buffer buffer);
 extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
 								   BlockNumber blockNum);
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 328473bdc9..2fe0ce890e 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -46,6 +46,7 @@
 #include <dirent.h>
 
 typedef enum RecoveryInitSyncMethod {
+	RECOVERY_INIT_SYNC_METHOD_WAL,
 	RECOVERY_INIT_SYNC_METHOD_FSYNC,
 	RECOVERY_INIT_SYNC_METHOD_SYNCFS
 } RecoveryInitSyncMethod;
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 752b440864..35e813410a 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -40,6 +40,7 @@ extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void mdsync(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum);
 
 extern void ForgetDatabaseSyncRequests(Oid dbid);
 extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a6fbf7b6a6..4a9cc9f181 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -103,6 +103,8 @@ extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 						 int nforks, BlockNumber *nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
+extern void smgrsync(SMgrRelation reln, ForkNumber forknum,
+					 BlockNumber blocknum);
 extern void AtEOXact_SMgr(void);
 
 #endif							/* SMGR_H */
-- 
2.30.1