diff --git a/configure.in b/configure.in index 4ea5699..ef66354 100644 --- a/configure.in +++ b/configure.in @@ -1243,6 +1243,8 @@ AC_CHECK_FUNCS(posix_fadvise) AC_CHECK_DECLS(posix_fadvise, [], [], [#include ]) fi +AC_CHECK_FUNCS(posix_fallocate) + AC_CHECK_DECLS(fdatasync, [], [], [#include ]) AC_CHECK_DECLS([strlcat, strlcpy]) # This is probably only present on Darwin, but may as well check always diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f7dd61c..fa51581 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -79,6 +79,9 @@ char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; bool fullPageWrites = true; bool log_checkpoints = false; +#ifdef HAVE_POSIX_FALLOCATE +bool wal_use_fallocate = false; +#endif int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; int CommitDelay = 0; /* precommit delay in microseconds */ @@ -2284,16 +2287,6 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) unlink(tmppath); - /* - * Allocate a buffer full of zeros. This is done before opening the file - * so that we don't leak the file descriptor if palloc fails. - * - * Note: palloc zbuffer, instead of just using a local char array, to - * ensure it is reasonably well-aligned; this may save a few cycles - * transferring data to the kernel. - */ - zbuffer = (char *) palloc0(XLOG_BLCKSZ); - /* do not use get_sync_bit() here --- want to fsync only at end of fill */ fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR); @@ -2302,38 +2295,71 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - /* - * Zero-fill the file. We have to do this the hard way to ensure that all - * the file space has really been allocated --- on platforms that allow - * "holes" in files, just seeking to the end doesn't allocate intermediate - * space. 
This way, we know that we have all the space and (after the - * fsync below) that all the indirect blocks are down on disk. Therefore, - * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the - * log file. - */ - for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) +#ifdef HAVE_POSIX_FALLOCATE + if (wal_use_fallocate) { - errno = 0; - if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ) + errno = posix_fallocate(fd, 0, XLogSegSize); + + if (errno) { - int save_errno = errno; - - /* - * If we fail to make the file, delete it to release disk space - */ + int errno_saved = errno; + + close(fd); unlink(tmppath); + errno = errno_saved; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not allocate space for file \"%s\" using posix_fallocate: %m", tmppath))); + } + } + else +#endif + { + /* + * Allocate a buffer full of zeros. NOTE(review): fd is already open here, + * so a palloc failure could leak the descriptor; consider allocating first. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ - close(fd); + zbuffer = (char *) palloc0(XLOG_BLCKSZ); - /* if write didn't set errno, assume problem is no disk space */ - errno = save_errno ? save_errno : ENOSPC; + /* + * Zero-fill the file. We have to do this the hard way to ensure that all + * the file space has really been allocated --- on platforms that allow + * "holes" in files, just seeking to the end doesn't allocate intermediate + * space. This way, we know that we have all the space and (after the + * fsync below) that all the indirect blocks are down on disk. Therefore, + * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the + * log file. 
+ */ + for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) + { + errno = 0; + if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ) + { + int save_errno = errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); + /* + * If we fail to make the file, delete it to release disk space + */ + unlink(tmppath); + + close(fd); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tmppath))); + } } + pfree(zbuffer); } - pfree(zbuffer); if (pg_fsync(fd) != 0) { @@ -6866,7 +6892,7 @@ CreateCheckPoint(int flags) XLogRecPtr curInsert; INSERT_RECPTR(curInsert, Insert, Insert->curridx); - if (curInsert == ControlFile->checkPoint + + if (curInsert == ControlFile->checkPoint + MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) && ControlFile->checkPoint == ControlFile->checkPointCopy.redo) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 22ba35f..e55d44d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1455,6 +1455,18 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, +#ifdef HAVE_POSIX_FALLOCATE + { + {"wal_use_fallocate", PGC_SIGHUP, WAL_SETTINGS, + gettext_noop("WAL writer should use posix_fallocate(3) instead of write(2)."), + NULL, + }, + &wal_use_fallocate, + false, + NULL, NULL, NULL + }, +#endif + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f8f06c1..4f917f2 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -190,6 +190,9 @@ extern char *XLogArchiveCommand; extern bool EnableHotStandby; extern bool fullPageWrites; extern bool log_checkpoints; +#ifdef HAVE_POSIX_FALLOCATE +extern bool wal_use_fallocate; +#endif /* WAL levels */ typedef 
enum WalLevel diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 8aabf3c..033127b 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -369,6 +369,9 @@ /* Define to 1 if you have the `posix_fadvise' function. */ #undef HAVE_POSIX_FADVISE +/* Define to 1 if you have the `posix_fallocate' function. */ +#undef HAVE_POSIX_FALLOCATE + /* Define to 1 if you have the POSIX signal interface. */ #undef HAVE_POSIX_SIGNALS