From 819a406f029b04ab6a500f63fe9c154332b65d8e Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 3 Oct 2022 21:58:22 -0700 Subject: [PATCH 3/3] Add direct I/O settings (developer-only). Provide a way to ask the kernel to use O_DIRECT (or local equivalent) for data and WAL files. This hurts performance currently and is not intended for end-users yet. Later proposed work will introduce our own I/O clustering, read-ahead, etc to replace the kernel features that are disabled with this option. This replaces the previous logic that would use O_DIRECT for the WAL in limited and obscure cases, now that there is an explicit setting. Discussion: https://postgr.es/m/ Author: Andres Freund Author: Thomas Munro --- doc/src/sgml/config.sgml | 51 ++++++++++++++++++++ src/backend/access/transam/xlog.c | 53 +++++++++++++-------- src/backend/access/transam/xlogprefetcher.c | 2 +- src/backend/storage/buffer/bufmgr.c | 13 +++-- src/backend/storage/buffer/localbuf.c | 4 +- src/backend/storage/file/fd.c | 5 ++ src/backend/storage/smgr/md.c | 29 +++++++++-- src/backend/storage/smgr/smgr.c | 20 ++++++++ src/backend/utils/misc/guc_tables.c | 33 +++++++++++++ src/include/access/xlog.h | 2 + src/include/storage/fd.h | 6 ++- src/include/storage/smgr.h | 5 ++ src/include/utils/guc_hooks.h | 2 + 13 files changed, 190 insertions(+), 35 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 559eb898a9..2d860dd900 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -11011,6 +11011,57 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir' + + io_data_direct (boolean) + + io_data_direct configuration parameter + + + + + Ask the kernel to minimize caching effects for relation data files + using O_DIRECT (most Unix-like systems), + F_NOCACHE (macOS) or + FILE_FLAG_NO_BUFFERING (Windows). Currently this + hurts performance, and is intended for developer testing only. + + + + + + io_wal_direct (boolean) + + io_wal_direct configuration parameter + + + + + Ask the kernel to minimize caching effects while writing WAL files + using O_DIRECT (most Unix-like systems), + F_NOCACHE (macOS) or + FILE_FLAG_NO_BUFFERING (Windows). Currently this + hurts performance, and is intended for developer testing only. + + + + + + io_wal_init_direct (boolean) + + io_wal_init_direct configuration parameter + + + + + Ask the kernel to minimize caching effects while initializing WAL files + using O_DIRECT (most Unix-like systems), + F_NOCACHE (macOS) or + FILE_FLAG_NO_BUFFERING (Windows). Currently this + hurts performance, and is intended for developer testing only. + + + + post_auth_delay (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8f10effe3a..5663bdf856 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -138,6 +138,8 @@ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; bool track_wal_io_timing = false; +bool io_wal_direct = false; +bool io_wal_init_direct = false; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -2926,6 +2928,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, XLogSegNo max_segno; int fd; int save_errno; + int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY; Assert(logtli != 0); @@ -2958,8 +2961,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, unlink(tmppath); + if (io_wal_init_direct) + open_flags |= PG_O_DIRECT; + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = BasicOpenFile(tmppath, open_flags); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -3373,7 +3379,7 @@ XLogFileClose(void) * use the cache to read the WAL segment. */ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - if (!XLogIsNeeded()) + if (!XLogIsNeeded() && !io_wal_direct) (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif @@ -4473,6 +4479,21 @@ show_in_hot_standby(void) return RecoveryInProgress() ? "on" : "off"; } +/* + * GUC check for direct I/O support. + */ +bool +check_io_wal_direct(bool *newval, void **extra, GucSource source) +{ +#if PG_O_DIRECT == 0 + if (*newval) + { + GUC_check_errdetail("io_wal_direct and io_wal_init_direct are not supported on this platform."); + return false; + } +#endif + return true; +} /* * Read the control file, set respective GUCs. @@ -8056,35 +8077,27 @@ xlog_redo(XLogReaderState *record) } /* - * Return the (possible) sync flag used for opening a file, depending on the - * value of the GUC wal_sync_method. + * Return the extra open flags used for opening a file, depending on the + * value of the GUCs wal_sync_method, fsync and io_wal_direct. */ static int get_sync_bit(int method) { int o_direct_flag = 0; - /* If fsync is disabled, never open in sync mode */ - if (!enableFsync) - return 0; - /* - * Optimize writes by bypassing kernel cache with O_DIRECT when using - * O_SYNC and O_DSYNC. But only if archiving and streaming are disabled, - * otherwise the archive command or walsender process will read the WAL - * soon after writing it, which is guaranteed to cause a physical read if - * we bypassed the kernel cache. We also skip the - * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same - * reason. - * - * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * Use O_DIRECT if requested, except in walreceiver process. The WAL * written by walreceiver is normally read by the startup process soon - * after it's written. Also, walreceiver performs unaligned writes, which + * after it's written. Also, walreceiver performs unaligned writes, which * don't work with O_DIRECT, so it is required for correctness too. */ - if (!XLogIsNeeded() && !AmWalReceiverProcess()) + if (io_wal_direct && !AmWalReceiverProcess()) o_direct_flag = PG_O_DIRECT; + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return o_direct_flag; + switch (method) { /* @@ -8096,7 +8109,7 @@ get_sync_bit(int method) case SYNC_METHOD_FSYNC: case SYNC_METHOD_FSYNC_WRITETHROUGH: case SYNC_METHOD_FDATASYNC: - return 0; + return o_direct_flag; #ifdef O_SYNC case SYNC_METHOD_OPEN: return O_SYNC | o_direct_flag; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 0cf03945ee..d840078afc 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) block->prefetch_buffer = InvalidBuffer; return LRQ_NEXT_IO; } - else + else if (!io_data_direct) { /* * This shouldn't be possible, because we already determined diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6b95381481..9918855f37 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -535,7 +535,7 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * Try to initiate an asynchronous read. This returns false in * recovery if the relation file doesn't exist. */ - if (smgrprefetch(smgr_reln, forkNum, blockNum)) + if (!io_data_direct && smgrprefetch(smgr_reln, forkNum, blockNum)) result.initiated_io = true; #endif /* USE_PREFETCH */ } @@ -582,11 +582,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * the kernel and therefore didn't really initiate I/O, and no way to know when * the I/O completes other than using synchronous ReadBuffer(). * - * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either + * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and * USE_PREFETCH is not defined (this build doesn't support prefetching due to - * lack of a kernel facility), or the underlying relation file wasn't found and - * we are in recovery. (If the relation file wasn't found and we are not in - * recovery, an error is raised). + * lack of a kernel facility), io_data_direct is enabled, or the underlying + * relation file wasn't found and we are in recovery. (If the relation file + * wasn't found and we are not in recovery, an error is raised). */ PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) @@ -4908,6 +4908,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag) { PendingWriteback *pending; + if (io_data_direct) + return; + /* * Add buffer to the pending writeback array, unless writeback control is * disabled. diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index f51d3527f6..f9c82a789e 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -87,8 +87,8 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, { #ifdef USE_PREFETCH /* Not in buffers, so initiate prefetch */ - smgrprefetch(smgr, forkNum, blockNum); - result.initiated_io = true; + if (!io_data_direct && smgrprefetch(smgr, forkNum, blockNum)) + result.initiated_io = true; #endif /* USE_PREFETCH */ } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 4151cafec5..aa720952f8 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -2021,6 +2021,11 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) if (nbytes <= 0) return; +#ifdef PG_O_DIRECT + if (VfdCache[file].fileFlags & PG_O_DIRECT) + return; +#endif + returnCode = FileAccess(file); if (returnCode < 0) return; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 719721a894..20ec37c310 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -142,6 +142,21 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); +static inline int +_mdfd_open_flags(ForkNumber forkNum) +{ + int flags = O_RDWR | PG_BINARY; + + /* + * XXX: not clear if direct IO ever is interesting for other forks? The + * FSM fork currently often ends up very fragmented when using direct IO, + * for example. + */ + if (io_data_direct /* && forkNum == MAIN_FORKNUM */) + flags |= PG_O_DIRECT; + + return flags; +} /* * mdinit() -- Initialize private state for magnetic disk storage manager. @@ -205,14 +220,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags(forknum) | O_CREAT | O_EXCL); if (fd < 0) { int save_errno = errno; if (isRedo) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags(forknum)); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -513,7 +528,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags(forknum)); if (fd < 0) { @@ -584,6 +599,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) off_t seekpos; MdfdVec *v; + Assert(!io_data_direct); + v = _mdfd_getseg(reln, forknum, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) @@ -609,6 +626,8 @@ void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + Assert(!io_data_direct); + /* * Issue flush requests in as few requests as possible; have to split at * segment boundaries though, since those are actually separate files. @@ -1186,7 +1205,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); + fd = PathNameOpenFile(fullpath, _mdfd_open_flags(forknum) | oflags); pfree(fullpath); @@ -1395,7 +1414,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) strlcpy(path, p, MAXPGPATH); pfree(p); - file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + file = PathNameOpenFile(path, _mdfd_open_flags(ftag->forknum)); if (file < 0) return -1; need_to_close = true; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c1a5febcbf..706a52b9f1 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -20,6 +20,7 @@ #include "access/xlogutils.h" #include "lib/ilist.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" @@ -27,6 +28,9 @@ #include "utils/inval.h" +/* GUCs */ +bool io_data_direct = false; + /* * This struct of function pointers defines the API between smgr.c and * any individual storage manager module. Note that smgr subfunctions are @@ -735,3 +739,19 @@ ProcessBarrierSmgrRelease(void) smgrreleaseall(); return true; } + +/* + * Check if this build allows smgr implementations to enable direct I/O. + */ +bool +check_io_data_direct(bool *newval, void **extra, GucSource source) +{ +#if PG_O_DIRECT == 0 + if (*newval) + { + GUC_check_errdetail("io_data_direct is not supported on this platform."); + return false; + } +#endif + return true; +} diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 05ab087934..e324378ad4 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1925,6 +1925,39 @@ struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"io_data_direct", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Access data files with direct I/O."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &io_data_direct, + false, + check_io_data_direct, NULL, NULL + }, + + { + {"io_wal_direct", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Write WAL files with direct I/O."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &io_wal_direct, + false, + check_io_wal_direct, NULL, NULL + }, + + { + {"io_wal_init_direct", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Initialize WAL files with direct I/O."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &io_wal_init_direct, + false, + check_io_wal_direct, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1fbd48fbda..6220370036 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -51,6 +51,8 @@ extern PGDLLIMPORT char *wal_consistency_checking_string; extern PGDLLIMPORT bool log_checkpoints; extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; +extern PGDLLIMPORT bool io_wal_direct; +extern PGDLLIMPORT bool io_wal_init_direct; extern PGDLLIMPORT int CheckPointSegments; diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index c0a212487d..283ff21e31 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -44,6 +44,7 @@ #define FD_H #include +#include typedef enum RecoveryInitSyncMethod { @@ -82,9 +83,10 @@ extern PGDLLIMPORT int max_safe_fds; * to the appropriate Windows flag in src/port/open.c. We simulate it with * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good - * idea on a Unix). + * idea on a Unix). We can only use it if the compiler will correctly align + * PGAlignedBlock for us, though. */ -#if defined(O_DIRECT) +#if defined(O_DIRECT) && defined(pg_attribute_aligned) #define PG_O_DIRECT O_DIRECT #elif defined(F_NOCACHE) #define PG_O_DIRECT 0x80000000 diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a07715356b..ef75934a16 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -17,6 +17,10 @@ #include "lib/ilist.h" #include "storage/block.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" + +/* GUCs */ +extern PGDLLIMPORT bool io_data_direct; /* * smgr.c maintains a table of SMgrRelation objects, which are essentially @@ -107,5 +111,6 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +extern bool check_io_data_direct(bool *newval, void **extra, GucSource source); #endif /* SMGR_H */ diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index f1a9a183b4..a9748f6b34 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -59,6 +59,8 @@ extern bool check_effective_io_concurrency(int *newval, void **extra, GucSource source); extern bool check_huge_page_size(int *newval, void **extra, GucSource source); extern const char *show_in_hot_standby(void); +extern bool check_io_data_direct(bool *newval, void **extra, GucSource source); +extern bool check_io_wal_direct(bool *newval, void **extra, GucSource source); extern bool check_locale_messages(char **newval, void **extra, GucSource source); extern void assign_locale_messages(const char *newval, void *extra); extern bool check_locale_monetary(char **newval, void **extra, GucSource source); -- 2.35.1