>From e60caf094f68496658e969cdd4df919fd66e9d29 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 6 Mar 2016 22:20:17 -0800 Subject: [PATCH 1/2] durable-rename-andres-v8 --- src/backend/access/transam/timeline.c | 40 +---- src/backend/access/transam/xlog.c | 64 ++------ src/backend/access/transam/xlogarchive.c | 21 +-- src/backend/postmaster/pgarch.c | 6 +- src/backend/replication/logical/origin.c | 23 +-- src/backend/replication/slot.c | 2 +- src/backend/storage/file/fd.c | 267 +++++++++++++++++++++++-------- src/include/storage/fd.h | 4 +- 8 files changed, 232 insertions(+), 195 deletions(-) diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index f6da673..bd91573 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -418,24 +418,10 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TLHistoryFilePath(path, newTLI); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing file. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). */ -#if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not link file \"%s\" to \"%s\": %m", - tmppath, path))); - unlink(tmppath); -#else - if (rename(tmppath, path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - tmppath, path))); -#endif + durable_link_or_rename(tmppath, path, ERROR); /* The history file can be archived immediately. */ if (XLogArchivingActive()) @@ -508,24 +494,10 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) TLHistoryFilePath(path, tli); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). */ -#if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not link file \"%s\" to \"%s\": %m", - tmppath, path))); - unlink(tmppath); -#else - if (rename(tmppath, path) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - tmppath, path))); -#endif + durable_link_or_rename(tmppath, path, ERROR); } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 00f139a..2d63a54 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3299,34 +3299,16 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Perform the rename using link if available, paranoidly trying to avoid + * overwriting an existing file (there shouldn't be one). */ -#if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (durable_link_or_rename(tmppath, path, LOG) != 0) { if (use_lock) LWLockRelease(ControlFileLock); - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m", - tmppath, path))); + /* durable_link_or_rename already emitted log message */ return false; } - unlink(tmppath); -#else - if (rename(tmppath, path) < 0) - { - if (use_lock) - LWLockRelease(ControlFileLock); - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m", - tmppath, path))); - return false; - } -#endif if (use_lock) LWLockRelease(ControlFileLock); @@ -3840,14 +3822,8 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) * flag, rename will fail. We'll try again at the next checkpoint. */ snprintf(newpath, MAXPGPATH, "%s.deleted", path); - if (rename(path, newpath) != 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not rename old transaction log file \"%s\": %m", - path))); + if (durable_rename(path, newpath, LOG) != 0) return; - } rc = unlink(newpath); #else rc = unlink(path); @@ -5339,11 +5315,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * re-enter archive recovery mode in a subsequent crash. */ unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE))); + durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL); ereport(LOG, (errmsg("archive recovery complete"))); @@ -6190,7 +6162,7 @@ StartupXLOG(void) if (stat(TABLESPACE_MAP, &st) == 0) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP, BACKUP_LABEL_FILE), @@ -6553,11 +6525,7 @@ StartupXLOG(void) if (haveBackupLabel) { unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - BACKUP_LABEL_FILE, BACKUP_LABEL_OLD))); + durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL); } /* @@ -6570,11 +6538,7 @@ StartupXLOG(void) if (haveTblspcMap) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) - ereport(FATAL, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - TABLESPACE_MAP, TABLESPACE_MAP_OLD))); + durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL); } /* Check that the GUCs used to generate the WAL allow recovery */ @@ -7351,11 +7315,7 @@ StartupXLOG(void) */ XLogArchiveCleanup(partialfname); - if (rename(origpath, partialpath) != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - origpath, partialpath))); + durable_rename(origpath, partialpath, ERROR); XLogArchiveNotify(partialfname); } } @@ -10911,7 +10871,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0) { ereport(WARNING, (errcode_for_file_access(), @@ -10934,7 +10894,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) { ereport(LOG, (errmsg("online backup mode canceled"), diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 277c14a..bcfc53f 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -451,13 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) */ snprintf(oldpath, MAXPGPATH, "%s.deleted%u", xlogfpath, deletedcounter++); - if (rename(xlogfpath, oldpath) != 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - xlogfpath, oldpath))); - } + durable_rename(xlogfpath, oldpath, ERROR); #else /* same-size buffers, so this never truncates */ strlcpy(oldpath, xlogfpath, MAXPGPATH); @@ -470,11 +464,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) reload = true; } - if (rename(path, xlogfpath) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - path, xlogfpath))); + durable_rename(path, xlogfpath, ERROR); /* * Create .done file forcibly to prevent the restored segment from being @@ -580,12 +570,7 @@ XLogArchiveForceDone(const char *xlog) StatusFilePath(archiveReady, xlog, ".ready"); if (stat(archiveReady, &stat_buf) == 0) { - if (rename(archiveReady, archiveDone) < 0) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - archiveReady, archiveDone))); - + (void) durable_rename(archiveReady, archiveDone, WARNING); return; } diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 397f802..1aa6466 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -728,9 +728,5 @@ pgarch_archiveDone(char *xlog) StatusFilePath(rlogready, xlog, ".ready"); StatusFilePath(rlogdone, xlog, ".done"); - if (rename(rlogready, rlogdone) < 0) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - rlogready, rlogdone))); + (void) durable_rename(rlogready, rlogdone, WARNING); } diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 0caf7a3..8c8833b 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -604,29 +604,10 @@ CheckPointReplicationOrigin(void) tmppath))); } - /* fsync the temporary file */ - if (pg_fsync(tmpfd) != 0) - { - CloseTransientFile(tmpfd); - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", - tmppath))); - } - CloseTransientFile(tmpfd); - /* rename to permanent file, fsync file and directory */ - if (rename(tmppath, path) != 0) - { - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - tmppath, path))); - } - - fsync_fname((char *) path, false); - fsync_fname("pg_logical", true); + /* fsync, rename to permanent file, fsync file and directory */ + durable_rename(tmppath, path, PANIC); } /* diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index affa9b9..ead221d 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -1095,7 +1095,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) START_CRIT_SECTION(); fsync_fname(path, false); - fsync_fname((char *) dir, true); + fsync_fname(dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1b30100..e3ccb8c 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -306,7 +306,10 @@ static void walkdir(const char *path, #ifdef PG_FLUSH_DATA_WORKS static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif -static void fsync_fname_ext(const char *fname, bool isdir, int elevel); +static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); + +static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); +static int fsync_parent_path(const char *fname, int elevel); /* @@ -413,54 +416,142 @@ pg_flush_data(int fd, off_t offset, off_t amount) * indicate the OS just doesn't allow/require fsyncing directories. */ void -fsync_fname(char *fname, bool isdir) +fsync_fname(const char *fname, bool isdir) { - int fd; - int returncode; - - /* - * Some OSs require directories to be opened read-only whereas other - * systems don't allow us to fsync files opened read-only; so we need both - * cases here - */ - if (!isdir) - fd = OpenTransientFile(fname, - O_RDWR | PG_BINARY, - S_IRUSR | S_IWUSR); - else - fd = OpenTransientFile(fname, - O_RDONLY | PG_BINARY, - S_IRUSR | S_IWUSR); - - /* - * Some OSs don't allow us to open directories at all (Windows returns - * EACCES) - */ - if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) - return; - - else if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fname))); - - returncode = pg_fsync(fd); - - /* Some OSs don't allow us to fsync directories at all */ - if (returncode != 0 && isdir && errno == EBADF) - { - CloseTransientFile(fd); - return; - } - - if (returncode != 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", fname))); - - CloseTransientFile(fd); + fsync_fname_ext(fname, isdir, false, ERROR); } +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * This routine ensures that, after returning, the effect of renaming file + * persists in case of a crash. A crash while this routine is running will + * leave you with either the old, or the new file. + * + * It does so by using fsync on the sourcefile before the rename, and the + * target file and directory after. + * + * Note that rename() cannot be used across arbitrary directories, as they + * might not be on the same filesystem. Therefore this routine does not + * support renaming across directories. + */ +int +durable_rename(const char *oldfile, const char *newfile, int elevel) +{ + int fd; + + /* + * First fsync the old and target path (if it exists), to ensure that they + * are properly persistent on disk. Syncing the target file is not + * strictly necessary, but it makes it easier to reason about crashes; + * because it's then guaranteed that either source or target file exists + * after a crash. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + + fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDWR, 0); + if (fd < 0) + { + if (errno != ENOENT) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", newfile))); + return -1; + } + } + + if (pg_fsync(fd) != 0) + { + /* XXX: perform close() before? might be outside a transaction. Consider errno! */ + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + newfile))); + CloseTransientFile(fd); + return -1; + } + CloseTransientFile(fd); + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + /* Same for parent directory */ + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} + +/* + * durable_link_or_rename -- rename a file in a durable manner. + * + * Similar to durable_rename(), except that this routine tries (but does not + * guarantee) not to overwrite the target file. + * + * Note that a crash in an unfortunate moment can leave you with two links to + * the target file. + */ +int +durable_link_or_rename(const char *oldfile, const char *newfile, int elevel) +{ + /* + * Ensure that, if we crash directly after the rename/link, a file with + * valid contents is moved into place. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + +#if HAVE_WORKING_LINK + if (link(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not link file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + unlink(oldfile); +#else + /* XXX: Add racy file existence check? */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + tmppath, path))); + return -1; + } +#endif + + /* + * Make change persistent in case of an OS crash, both the new entry and + * its parent directory need to be flushed. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + /* Same for parent directory */ + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} /* * InitFileAccess --- initialize this module during backend startup @@ -2547,10 +2638,10 @@ SyncDataDirectory(void) * in pg_tblspc, they'll get fsync'd twice. That's not an expected case * so we don't worry about optimizing it. */ - walkdir(".", fsync_fname_ext, false, LOG); + walkdir(".", datadir_fsync_fname, false, LOG); if (xlog_is_symlink) - walkdir("pg_xlog", fsync_fname_ext, false, LOG); - walkdir("pg_tblspc", fsync_fname_ext, true, LOG); + walkdir("pg_xlog", datadir_fsync_fname, false, LOG); + walkdir("pg_tblspc", datadir_fsync_fname, true, LOG); } /* @@ -2667,12 +2758,12 @@ pre_sync_fname(const char *fname, bool isdir, int elevel) /* * fsync_fname_ext -- Try to fsync a file or directory * - * Ignores errors trying to open unreadable files, or trying to fsync - * directories on systems where that isn't allowed/required, and logs other - * errors at a caller-specified level. + * If desired ignores errors trying to open unreadable files, or trying to + * fsync directories on systems where that isn't allowed/required, and logs + * other errors at a caller-specified level. */ -static void -fsync_fname_ext(const char *fname, bool isdir, int elevel) +static int +fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) { int fd; int flags; @@ -2690,20 +2781,23 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) else flags |= O_RDONLY; - /* - * Open the file, silently ignoring errors about unreadable files (or - * unsupported operations, e.g. opening a directory under Windows), and - * logging others. - */ fd = OpenTransientFile((char *) fname, flags, 0); - if (fd < 0) + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES), just ignore the error in that case. If desired also silently + * ignoring errors about unreadable files. Log others. + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return 0; + else if (fd < 0 && ignore_perm && errno == EACCES) + return 0; + else if (fd < 0) { - if (errno == EACCES || (isdir && errno == EISDIR)) - return; ereport(elevel, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", fname))); - return; + return -1; } returncode = pg_fsync(fd); @@ -2713,9 +2807,56 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) * those errors. Anything else needs to be logged. */ if (returncode != 0 && !(isdir && errno == EBADF)) + { + /* XXX: perform close() before? might be outside a transaction. Consider errno! */ ereport(elevel, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", fname))); + (void) CloseTransientFile(fd); + return -1; + } (void) CloseTransientFile(fd); + + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static int +fsync_parent_path(const char *fname, int elevel) +{ + char parentpath[MAXPGPATH]; + + /* Same for parent directory */ + snprintf(parentpath, MAXPGPATH, "%s", fname); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument + * is just a file name (see comments in path.c), so handle that as being + * the current directory. + */ + if (strlen(parentpath) == 0) + sprintf(parentpath, "."); + + if (fsync_fname_ext(parentpath, true, false, elevel) != 0) + return -1; + + return 0; +} + +static void +datadir_fsync_fname(const char *fname, bool isdir, int elevel) +{ + /* + * We want to silently ignoring errors about unreadable files (or + * unsupported operations, e.g. opening a directory under Windows). Pass + * that desire on to fsync_fname_ext(). + */ + fsync_fname_ext(fname, isdir, true, elevel); } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 4a3fccb..66dc5dc 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -113,7 +113,9 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); -extern void fsync_fname(char *fname, bool isdir); +extern void fsync_fname(const char *fname, bool isdir); +extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); +extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */ -- 2.7.0.229.g701fa7f