From 4f1190bbb4e37319be37f69fe1682ec5ef7eb971 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot Date: Mon, 24 Feb 2025 11:29:35 +0000 Subject: [PATCH v1] re-introduce track_wal_io_timing a051e71e28a added the "timing tracking" in the WAL code path with "only" track_io_timing enabled (and track_wal_io_timing still false). Then it made sense for 2421e9a51d2 to get rid of the track_wal_io_timing GUC. Concerns have been raised about the fact that a051e71e28a added the "timing tracking" in the WAL code path with "only" track_io_timing enabled. So this commit re-introduces the track_wal_io_timing GUC and ensure that WAL timing is tracked in pg_stat_io only if track_io_timing and track_wal_io_timing are both enabled. --- doc/src/sgml/config.sgml | 25 +++++++++++++++++++ doc/src/sgml/monitoring.sgml | 5 ++++ src/backend/access/transam/xlog.c | 21 ++++++++++------ src/backend/access/transam/xlogreader.c | 4 +-- src/backend/access/transam/xlogrecovery.c | 6 ++--- src/backend/storage/buffer/bufmgr.c | 21 ++++++++-------- src/backend/storage/buffer/localbuf.c | 8 +++--- src/backend/storage/smgr/md.c | 8 +++--- src/backend/utils/activity/pgstat_io.c | 12 +++++---- src/backend/utils/misc/guc_tables.c | 9 +++++++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/access/xlog.h | 1 + src/include/pgstat.h | 4 +-- 13 files changed, 87 insertions(+), 38 deletions(-) 25.8% doc/src/sgml/ 24.2% src/backend/access/transam/ 25.8% src/backend/storage/buffer/ 6.9% src/backend/storage/smgr/ 8.2% src/backend/utils/activity/ 4.2% src/backend/utils/misc/ 4.6% src/include/ diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index a8354576108..7e7eae01d33 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8393,6 +8393,31 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; + + track_wal_io_timing (boolean) + + track_wal_io_timing configuration parameter + + + + + Enables timing of WAL I/O write and fsync calls. This parameter is off by + default, as it will repeatedly query the operating system for the current time, + which may cause significant overhead on some platforms. + You can use the pg_test_timing tool to + measure the overhead of timing on your system. + I/O timing information is + displayed in + pg_stat_io and in the output of the + + pg_stat_get_backend_io() function only if + is also enabled. + Only superusers and users with the appropriate SET + privilege can change this setting. + + + + track_functions (enum) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3dfd059b7ee..6ff8a4ec929 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -183,6 +183,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser of block read, write, extend, and fsync times. + + The parameter enables monitoring + of WAL write and fsync times. + + Normally these parameters are set in postgresql.conf so that they apply to all server processes, but it is possible to turn diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 919314f8258..9054aed2242 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -134,6 +134,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; int wal_decode_buffer_size = 512 * 1024; +bool track_wal_io_timing = false; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -2432,19 +2433,22 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) nleft = nbytes; do { + bool track_timing = track_io_timing && track_wal_io_timing; + errno = 0; /* * Measure I/O timing to write WAL data, for pg_stat_io. */ - start = pgstat_prepare_io_time(); + start = pgstat_prepare_io_time(track_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, - IOOP_WRITE, start, 1, written); + IOOP_WRITE, start, 1, written, + track_timing); if (written <= 0) { @@ -3248,7 +3252,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, errmsg("could not create file \"%s\": %m", tmppath))); /* Measure I/O timing when initializing segment */ - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; @@ -3291,7 +3295,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, */ pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE, io_start, 1, - wal_init_zero ? wal_segment_size : 1); + wal_init_zero ? wal_segment_size : 1, track_io_timing); if (save_errno) { @@ -3310,7 +3314,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, } /* Measure I/O timing when flushing segment */ - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) @@ -3325,7 +3329,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, - IOOP_FSYNC, io_start, 1, 0); + IOOP_FSYNC, io_start, 1, 0, track_io_timing); if (close(fd) != 0) ereport(ERROR, @@ -8728,6 +8732,7 @@ void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) { char *msg = NULL; + bool track_timing = track_io_timing && track_wal_io_timing; instr_time start; Assert(tli != 0); @@ -8744,7 +8749,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) /* * Measure I/O timing to sync the WAL file for pg_stat_io. */ - start = pgstat_prepare_io_time(); + start = pgstat_prepare_io_time(track_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); switch (wal_sync_method) @@ -8791,7 +8796,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC, - start, 1, 0); + start, 1, 0, track_timing); } /* diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 995e4f7c26f..d2418243a18 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1557,7 +1557,7 @@ WALRead(XLogReaderState *state, #ifndef FRONTEND /* Measure I/O timing when reading segment */ - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_READ); #endif @@ -1570,7 +1570,7 @@ WALRead(XLogReaderState *state, pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, - io_start, 1, readbytes); + io_start, 1, readbytes, track_io_timing); #endif if (readbytes <= 0) diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 915cb330295..9e177e9716a 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -3401,7 +3401,7 @@ retry: readOff = targetPageOff; /* Measure I/O timing when reading segment */ - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); pgstat_report_wait_start(WAIT_EVENT_WAL_READ); r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); @@ -3413,7 +3413,7 @@ retry: pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, - io_start, 1, r); + io_start, 1, r, track_io_timing); XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); if (r < 0) @@ -3436,7 +3436,7 @@ retry: pgstat_report_wait_end(); pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, - io_start, 1, r); + io_start, 1, r, track_io_timing); Assert(targetSegNo == readSegNo); Assert(targetPageOff == readOff); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index d3216ef6ba7..5de9490d439 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1509,10 +1509,10 @@ WaitReadBuffers(ReadBuffersOperation *operation) io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]); } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len); pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start, - 1, io_buffers_len * BLCKSZ); + 1, io_buffers_len * BLCKSZ, track_io_timing); /* Verify each block we read, and terminate the I/O. */ for (int j = 0; j < io_buffers_len; ++j) @@ -2401,7 +2401,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, } } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * Note: if smgrzeroextend fails, we will end up with buffers that are @@ -2426,7 +2426,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr, UnlockRelationForExtension(bmr.rel, ExclusiveLock); pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND, - io_start, 1, extend_by * BLCKSZ); + io_start, 1, extend_by * BLCKSZ, track_io_timing); /* Set BM_VALID, terminate IO, and wake up any waiters */ for (uint32 i = 0; i < extend_by; i++) @@ -3858,7 +3858,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, */ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * bufToWrite is either the shared buffer or a copy, as appropriate. @@ -3888,7 +3888,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE). */ pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, - IOOP_WRITE, io_start, 1, BLCKSZ); + IOOP_WRITE, io_start, 1, BLCKSZ, track_io_timing); pgBufferUsage.shared_blks_written++; @@ -4460,7 +4460,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); smgrwrite(srel, BufTagGetForkNum(&bufHdr->tag), @@ -4470,7 +4470,7 @@ FlushRelationBuffers(Relation rel) pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_WRITE, - io_start, 1, BLCKSZ); + io_start, 1, BLCKSZ, track_io_timing); buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED); pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); @@ -5917,7 +5917,7 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) sort_pending_writebacks(wb_context->pending_writebacks, wb_context->nr_pending); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* * Coalesce neighbouring writes, but nothing else. For that we iterate @@ -5977,7 +5977,8 @@ IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) * blocks of permanent relations. */ pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, - IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0); + IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0, + track_io_timing); wb_context->nr_pending = 0; } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 78b65fb160a..4ca78c412d3 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -244,7 +244,7 @@ GetLocalVictimBuffer(void) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* And write... */ smgrwrite(oreln, @@ -255,7 +255,7 @@ GetLocalVictimBuffer(void) /* Temporary table I/O does not use Buffer Access Strategies */ pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, - IOOP_WRITE, io_start, 1, BLCKSZ); + IOOP_WRITE, io_start, 1, BLCKSZ, track_io_timing); /* Mark not-dirty now in case we error out below */ buf_state &= ~BM_DIRTY; @@ -414,13 +414,13 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr, } } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* actually extend relation */ smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND, - io_start, 1, extend_by * BLCKSZ); + io_start, 1, extend_by * BLCKSZ, track_io_timing); for (uint32 i = 0; i < extend_by; i++) { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 298754d1b64..d6df9f5102a 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1381,7 +1381,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) ereport(DEBUG1, (errmsg_internal("could not forward fsync request because request queue is full"))); - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) ereport(data_sync_elevel(ERROR), @@ -1401,7 +1401,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) * backend fsyncs. */ pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, - IOOP_FSYNC, io_start, 1, 0); + IOOP_FSYNC, io_start, 1, 0, track_io_timing); } } @@ -1786,7 +1786,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) need_to_close = true; } - io_start = pgstat_prepare_io_time(); + io_start = pgstat_prepare_io_time(track_io_timing); /* Sync the file. */ result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); @@ -1796,7 +1796,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) FileClose(file); pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, - IOOP_FSYNC, io_start, 1, 0); + IOOP_FSYNC, io_start, 1, 0, track_io_timing); errno = save_errno; return result; diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c index e7362b52a37..3746f9e4bc2 100644 --- a/src/backend/utils/activity/pgstat_io.c +++ b/src/backend/utils/activity/pgstat_io.c @@ -83,14 +83,15 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, } /* - * Initialize the internal timing for an IO operation. + * Initialize the internal timing for an IO operation, depending on an + * IO timing GUC. */ instr_time -pgstat_prepare_io_time(void) +pgstat_prepare_io_time(bool track_io_guc) { instr_time io_start; - if (track_io_timing) + if (track_io_guc) INSTR_TIME_SET_CURRENT(io_start); else { @@ -117,9 +118,10 @@ pgstat_prepare_io_time(void) */ void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, - instr_time start_time, uint32 cnt, uint64 bytes) + instr_time start_time, uint32 cnt, uint64 bytes, + bool track_io_guc) { - if (track_io_timing) + if (track_io_guc) { instr_time io_time; diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 3f1c7b1ebf6..03a6dd49154 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -1500,6 +1500,15 @@ struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"track_wal_io_timing", PGC_SUSET, STATS_CUMULATIVE, + gettext_noop("Collects timing statistics for WAL I/O activity."), + NULL + }, + &track_wal_io_timing, + false, + NULL, NULL, NULL + }, { {"update_process_title", PGC_SUSET, PROCESS_TITLE, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e771d87da1f..5362ff80519 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -641,6 +641,7 @@ #track_counts = on #track_cost_delay_timing = off #track_io_timing = off +#track_wal_io_timing = off #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 0a50d160bba..d313099c027 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -54,6 +54,7 @@ extern PGDLLIMPORT char *wal_consistency_checking_string; extern PGDLLIMPORT bool log_checkpoints; extern PGDLLIMPORT int CommitDelay; extern PGDLLIMPORT int CommitSiblings; +extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index fc651d03cf9..1e60e94be90 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -564,10 +564,10 @@ extern bool pgstat_bktype_io_stats_valid(PgStat_BktypeIO *backend_io, BackendType bktype); extern void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes); -extern instr_time pgstat_prepare_io_time(void); +extern instr_time pgstat_prepare_io_time(bool track_io_guc); extern void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, - uint32 cnt, uint64 bytes); + uint32 cnt, uint64 bytes, bool track_io_guc); extern PgStat_IO *pgstat_fetch_stat_io(void); extern const char *pgstat_get_io_context_name(IOContext io_context); -- 2.34.1