diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index f6da673..67e0f73 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -418,19 +418,20 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, TLHistoryFilePath(path, newTLI); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing file. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure that + * we don't overwrite an existing file. However, there shouldn't be one, + * so rename_safe() is an acceptable substitute except for the truly + * paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not link file \"%s\" to \"%s\": %m", tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -508,19 +509,20 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size) TLHistoryFilePath(path, tli); /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure that + * we don't overwrite an existing logfile. However, there shouldn't be + * one, so rename_safe() is an acceptable substitute except for the + * truly paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not link file \"%s\" to \"%s\": %m", tmppath, path))); unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 94b79ac..93b6896 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -3299,12 +3299,13 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } /* - * Prefer link() to rename() here just to be really sure that we don't - * overwrite an existing logfile. However, there shouldn't be one, so - * rename() is an acceptable substitute except for the truly paranoid. + * Prefer link_safe() to rename_safe() here just to be really sure + * that we don't overwrite an existing logfile. However, there + * shouldn't be one, so rename_safe() is an acceptable substitute + * except for the truly paranoid. */ #if HAVE_WORKING_LINK - if (link(tmppath, path) < 0) + if (link_safe(tmppath, path) < 0) { if (use_lock) LWLockRelease(ControlFileLock); @@ -3316,7 +3317,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, } unlink(tmppath); #else - if (rename(tmppath, path) < 0) + if (rename_safe(tmppath, path) < 0) { if (use_lock) LWLockRelease(ControlFileLock); @@ -3840,7 +3841,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) * flag, rename will fail. We'll try again at the next checkpoint. */ snprintf(newpath, MAXPGPATH, "%s.deleted", path); - if (rename(path, newpath) != 0) + if (rename_safe(path, newpath) != 0) { ereport(LOG, (errcode_for_file_access(), @@ -5339,7 +5340,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * re-enter archive recovery mode in a subsequent crash. */ unlink(RECOVERY_COMMAND_DONE); - if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) + if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6186,7 +6187,7 @@ StartupXLOG(void) if (stat(TABLESPACE_MAP, &st) == 0) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP, BACKUP_LABEL_FILE), @@ -6549,7 +6550,7 @@ StartupXLOG(void) if (haveBackupLabel) { unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -6566,7 +6567,7 @@ StartupXLOG(void) if (haveTblspcMap) { unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0) ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -7347,7 +7348,7 @@ StartupXLOG(void) */ XLogArchiveCleanup(partialfname); - if (rename(origpath, partialpath) != 0) + if (rename_safe(origpath, partialpath) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -10907,7 +10908,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(BACKUP_LABEL_OLD); - if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) + if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0) { ereport(WARNING, (errcode_for_file_access(), @@ -10930,7 +10931,7 @@ CancelBackup(void) /* remove leftover file from previously canceled backup if it exists */ unlink(TABLESPACE_MAP_OLD); - if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) + if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0) { ereport(LOG, (errmsg("online backup mode canceled"), diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index 277c14a..65c03b2 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) */ snprintf(oldpath, MAXPGPATH, "%s.deleted%u", xlogfpath, deletedcounter++); - if (rename(xlogfpath, oldpath) != 0) + if (rename_safe(xlogfpath, oldpath) != 0) { ereport(ERROR, (errcode_for_file_access(), @@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname) reload = true; } - if (rename(path, xlogfpath) < 0) + if (rename_safe(path, xlogfpath) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", @@ -580,7 +580,7 @@ XLogArchiveForceDone(const char *xlog) StatusFilePath(archiveReady, xlog, ".ready"); if (stat(archiveReady, &stat_buf) == 0) { - if (rename(archiveReady, archiveDone) < 0) + if (rename_safe(archiveReady, archiveDone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c index 397f802..db54889 100644 --- a/src/backend/postmaster/pgarch.c +++ b/src/backend/postmaster/pgarch.c @@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog) StatusFilePath(rlogready, xlog, ".ready"); StatusFilePath(rlogdone, xlog, ".done"); - if (rename(rlogready, rlogdone) < 0) + if (rename_safe(rlogready, rlogdone) < 0) ereport(WARNING, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index 0caf7a3..96368c7 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void) CloseTransientFile(tmpfd); /* rename to permanent file, fsync file and directory */ - if (rename(tmppath, path) != 0) + if (rename_safe(tmppath, path) != 0) { ereport(PANIC, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", tmppath, path))); } - - fsync_fname((char *) path, false); - fsync_fname("pg_logical", true); } /* diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index affa9b9..ead221d 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -1095,7 +1095,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) START_CRIT_SECTION(); fsync_fname(path, false); - fsync_fname((char *) dir, true); + fsync_fname(dir, true); fsync_fname("pg_replslot", true); END_CRIT_SECTION(); diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 1b30100..37c8926 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -307,6 +307,7 @@ static void walkdir(const char *path, static void pre_sync_fname(const char *fname, bool isdir, int elevel); #endif static void fsync_fname_ext(const char *fname, bool isdir, int elevel); +static void fsync_parent_path(const char *fname); /* @@ -413,7 +414,7 @@ pg_flush_data(int fd, off_t offset, off_t amount) * indicate the OS just doesn't allow/require fsyncing directories. */ void -fsync_fname(char *fname, bool isdir) +fsync_fname(const char *fname, bool isdir) { int fd; int returncode; @@ -424,11 +425,11 @@ fsync_fname(char *fname, bool isdir) * cases here */ if (!isdir) - fd = OpenTransientFile(fname, + fd = OpenTransientFile((char *) fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); else - fd = OpenTransientFile(fname, + fd = OpenTransientFile((char *) fname, O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR); @@ -461,6 +462,75 @@ fsync_fname(char *fname, bool isdir) CloseTransientFile(fd); } +/* + * rename_safe -- rename of a file, making it on-disk persistent + * + * This routine ensures that a rename file persists in case of a crash by using + * fsync on the old and new files before and after performing the rename so as + * this categorizes as an all-or-nothing operation. + */ +int +rename_safe(const char *oldfile, const char *newfile) +{ + struct stat filestats; + + /* + * First fsync the old entry and new entry, it this one exists, to ensure + * that they are properly persistent on disk. Calling this routine with + * an existing new target file is fine, rename() will first remove it + * before performing its operation. + */ + fsync_fname(oldfile, false); + if (stat(newfile, &filestats) == 0) + fsync_fname(newfile, false); + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) != 0) + return -1; + + /* + * Make change persistent in case of an OS crash, both the new entry and + * its parent directory need to be flushed. + */ + fsync_fname(newfile, false); + + /* + * Same for parent directory. This routine is never called to rename + * files across directories, but if this proves to become the case, + * flushing the parent directory if the old file would be necessary. + */ + fsync_parent_path(newfile); + return 0; +} + +/* + * link_safe -- make a file hard link, making it on-disk persistent + * + * This routine ensures that a hard link created on a file persists on system + * in case of a crash by using fsync where on the link generated as well as on + * its parent directory. + */ +int +link_safe(const char *oldfile, const char *newfile) +{ + if (link(oldfile, newfile) < 0) + return -1; + + /* + * Make the link persistent in case of an OS crash, the new entry + * generated as well as its parent directory need to be flushed. + */ + fsync_fname(newfile, false); + + /* + * Same for parent directory. This routine is never called to rename + * files across directories, but if this proves to become the case, + * flushing the parent directory if the old file would be necessary. + */ + fsync_parent_path(newfile); + return 0; +} + /* * InitFileAccess --- initialize this module during backend startup @@ -2719,3 +2789,29 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel) (void) CloseTransientFile(fd); } + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static void +fsync_parent_path(const char *fname) +{ + char parentpath[MAXPGPATH]; + + /* Same for parent directory */ + snprintf(parentpath, MAXPGPATH, "%s", fname); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument + * is just a file name (see comments in path.c), so handle that as being + * the current directory. + */ + if (strlen(parentpath) == 0) + fsync_fname(".", true); + else + fsync_fname(parentpath, true); +} diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 4a3fccb..7f3115a 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -113,7 +113,9 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); -extern void fsync_fname(char *fname, bool isdir); +extern void fsync_fname(const char *fname, bool isdir); +extern int rename_safe(const char *oldfile, const char *newfile); +extern int link_safe(const char *oldfile, const char *newfile); extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */