From 850cb9122573d294903fe422d788bd102fcf5a90 Mon Sep 17 00:00:00 2001 From: Paul Guo Date: Mon, 25 Jan 2021 11:19:40 +0800 Subject: [PATCH v3 1/2] Fsync the affected files/directories only in pg_rewind. Previously in the end, pg_rewind would fsync the whole pgdata directory on the target, but that is a waste since usually just part of the files/directories on the target are modified. Other files on the target should have been flushed since pg_rewind requires a clean shutdown before doing the real work. This would help the scenario that the target postgres instance includes millions of files, which has been seen in a real environment. --- src/backend/storage/file/fd.c | 9 ---- src/bin/pg_rewind/file_ops.c | 19 ------- src/bin/pg_rewind/file_ops.h | 1 - src/bin/pg_rewind/pg_rewind.c | 116 ++++++++++++++++++++++++++++++++++++++-- src/common/file_utils.c | 12 +---- src/include/common/file_utils.h | 3 ++ src/include/pg_config_manual.h | 9 ++++ 7 files changed, 125 insertions(+), 44 deletions(-) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index e8cd7ef088..61bb64aebf 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -101,15 +101,6 @@ #include "utils/guc.h" #include "utils/resowner_private.h" -/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ -#if defined(HAVE_SYNC_FILE_RANGE) -#define PG_FLUSH_DATA_WORKS 1 -#elif !defined(WIN32) && defined(MS_ASYNC) -#define PG_FLUSH_DATA_WORKS 1 -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) -#define PG_FLUSH_DATA_WORKS 1 -#endif - /* * We must leave some file descriptors free for system(), the dynamic loader, * and other code that tries to open files without consulting fd.c. This diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index c50f283ede..0dd9c6c95e 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -281,25 +281,6 @@ remove_target_symlink(const char *path) dstpath); } -/* - * Sync target data directory to ensure that modifications are safely on disk. - * - * We do this once, for the whole data directory, for performance reasons. At - * the end of pg_rewind's run, the kernel is likely to already have flushed - * most dirty buffers to disk. Additionally fsync_pgdata uses a two-pass - * approach (only initiating writeback in the first pass), which often reduces - * the overall amount of IO noticeably. - */ -void -sync_target_dir(void) -{ - if (!do_sync || dry_run) - return; - - fsync_pgdata(datadir_target, PG_VERSION_NUM); -} - - /* * Read a file into memory. The file to be read is /. * The file contents are returned in a malloc'd buffer, and *filesize diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h index 611981f293..b6b8b319d5 100644 --- a/src/bin/pg_rewind/file_ops.h +++ b/src/bin/pg_rewind/file_ops.h @@ -19,7 +19,6 @@ extern void remove_target_file(const char *path, bool missing_ok); extern void truncate_target_file(const char *path, off_t newsize); extern void create_target(file_entry_t *t); extern void remove_target(file_entry_t *t); -extern void sync_target_dir(void); extern char *slurpFile(const char *datadir, const char *path, size_t *filesize); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 2ac4910778..b7174dfbae 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -20,6 +20,7 @@ #include "catalog/pg_control.h" #include "common/controldata_utils.h" #include "common/file_perm.h" +#include "common/file_utils.h" #include "common/restricted_token.h" #include "common/string.h" #include "fe_utils/recovery_gen.h" @@ -32,6 +33,7 @@ static void usage(const char *progname); +static void perform_sync(filemap_t *filemap); static void perform_rewind(filemap_t *filemap, rewind_source *source, XLogRecPtr chkptrec, TimeLineID chkpttli, @@ -461,15 +463,15 @@ main(int argc, char **argv) */ perform_rewind(filemap, source, chkptrec, chkpttli, chkptredo); - if (showprogress) - pg_log_info("syncing target data directory"); - sync_target_dir(); - /* Also update the standby configuration, if requested. */ if (writerecoveryconf && !dry_run) WriteRecoveryConfig(conn, datadir_target, GenerateRecoveryConfig(conn, NULL)); + if (showprogress) + pg_log_info("syncing target data directory"); + perform_sync(filemap); + /* don't need the source connection anymore */ source->destroy(source); if (conn) @@ -483,6 +485,112 @@ main(int argc, char **argv) return 0; } +/* + * Fsync the modified files and directories on the target. + * + * The target should be cleanly shutdown before rewinding so we do not need + * to fsync the whole pg data directory, instead, we just flush those modified + * files/directories. + */ +static void +perform_sync(filemap_t *filemap) +{ + char parentpath[MAXPGPATH]; + + if (!do_sync || dry_run) + return; + + if (chdir(datadir_target) < 0) + { + pg_log_error("could not change directory to \"%s\": %m", datadir_target); + exit(1); + } + +#ifdef PG_FLUSH_DATA_WORKS + /* Hint the OS to flush data. */ + for (int i = 0; i < filemap->nentries; i++) + { + file_entry_t *entry = filemap->entries[i]; + + if (entry->target_pages_to_overwrite.bitmapsize > 0) + pre_sync_fname(entry->path, false); + + switch (entry->action) + { + case FILE_ACTION_COPY: + case FILE_ACTION_TRUNCATE: + case FILE_ACTION_COPY_TAIL: + pre_sync_fname(entry->path, false); + break; + + case FILE_ACTION_CREATE: + pre_sync_fname(entry->path, + entry->source_type == FILE_TYPE_DIRECTORY); + /* FALLTHROUGH */ + case FILE_ACTION_REMOVE: + /* + * Fsync the parent directory if we either create or delete + * files/directories in the parent directory. The parent + * directory might be missing as expected, but we ignore that + * error. + */ + strlcpy(parentpath, entry->path, MAXPGPATH); + get_parent_directory(parentpath); + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + pre_sync_fname(parentpath, true); + break; + + default: + break; + } + } +#endif + + /* Do the fsync()s finally */ + for (int i = 0; i < filemap->nentries; i++) + { + file_entry_t *entry = filemap->entries[i]; + + if (entry->target_pages_to_overwrite.bitmapsize > 0) + fsync_fname(entry->path, false); + + switch (entry->action) + { + case FILE_ACTION_COPY: + case FILE_ACTION_TRUNCATE: + case FILE_ACTION_COPY_TAIL: + fsync_fname(entry->path, false); + break; + + case FILE_ACTION_CREATE: + fsync_fname(entry->path, + entry->source_type == FILE_TYPE_DIRECTORY); + /* FALLTHROUGH */ + case FILE_ACTION_REMOVE: + /* + * Fsync the parent directory if we either create or delete + * files/directories in the parent directory. The parent + * directory might be missing as expected, but we ignore that + * error. + */ + fsync_parent_path(entry->path); + break; + + default: + break; + } + } + + fsync_fname("global/pg_control", false); + fsync_fname("backup_label", false); + if (access("recovery.conf", F_OK) == 0) + fsync_fname("recovery.conf", false); + if (access("postgresql.auto.conf", F_OK) == 0) + fsync_fname("postgresql.auto.conf", false); +} + + /* * Perform the rewind. * diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 40b73bbe1a..68da0615f7 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -31,21 +31,11 @@ #ifdef FRONTEND -/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ -#if defined(HAVE_SYNC_FILE_RANGE) -#define PG_FLUSH_DATA_WORKS 1 -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) -#define PG_FLUSH_DATA_WORKS 1 -#endif - /* * pg_xlog has been renamed to pg_wal in version 10. */ #define MINIMUM_VERSION_FOR_PG_WAL 100000 -#ifdef PG_FLUSH_DATA_WORKS -static int pre_sync_fname(const char *fname, bool isdir); -#endif static void walkdir(const char *path, int (*action) (const char *fname, bool isdir), bool process_symlinks); @@ -218,7 +208,7 @@ walkdir(const char *path, */ #ifdef PG_FLUSH_DATA_WORKS -static int +int pre_sync_fname(const char *fname, bool isdir) { int fd; diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 978a57460a..114c196556 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -25,6 +25,9 @@ typedef enum PGFileType } PGFileType; #ifdef FRONTEND +#ifdef PG_FLUSH_DATA_WORKS +extern int pre_sync_fname(const char *fname, bool isdir); +#endif extern int fsync_fname(const char *fname, bool isdir); extern void fsync_pgdata(const char *pg_data, int serverVersion); extern void fsync_dir_recurse(const char *dir); diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 27da86e5e0..dc25299192 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -402,3 +402,12 @@ * Enable tracing of syncscan operations (see also the trace_syncscan GUC var). */ /* #define TRACE_SYNCSCAN */ + +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(HAVE_SYNC_FILE_RANGE) +#define PG_FLUSH_DATA_WORKS 1 +#elif !defined(WIN32) && defined(MS_ASYNC) +#define PG_FLUSH_DATA_WORKS 1 +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif -- 2.14.3