From 0d6e0106ce2477af6327dfb899c295c0ad0936af Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Wed, 11 Mar 2026 00:55:03 -0700 Subject: [PATCH v11 5/7] instrumentation: Use Time-Stamp Counter (TSC) on x86-64 for faster measurements This allows the direct use of the Time-Stamp Counter (TSC) value retrieved from the CPU using RDTSC/RDTSCP instructions, instead of APIs like clock_gettime() on POSIX systems. This reduces the overhead of EXPLAIN with ANALYZE and TIMING ON. Tests showed that runtime when instrumented can be reduced by up to 10% for queries moving lots of rows through the plan. To control use of the TSC, the new "timing_clock_source" GUC is introduced, whose default ("auto") automatically uses the TSC when running on Linux/x86-64, in case the system clocksource is reported as "tsc". The use of the system APIs can be enforced by setting "system", or on x86-64 architectures the use of TSC can be enforced by explicitly setting "tsc". In order to use the TSC the frequency is first determined by use of CPUID, and if not available, by running a short calibration loop at program start, falling back to the system time if TSC values are not stable. Note that we split TSC usage into the RDTSC CPU instruction which does not wait for out-of-order execution (faster, less precise) and the RDTSCP instruction, which waits for outstanding instructions to retire. RDTSCP is deemed to have little benefit in the typical InstrStartNode() / InstrStopNode() use case of EXPLAIN, and can be up to twice as slow. To separate these use cases, the new macro INSTR_TIME_SET_CURRENT_FAST() is introduced, which uses RDTSC. The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when precision is more important than performance. When the system timing clock source is used, both of these macros instead utilize the system APIs (clock_gettime / QueryPerformanceCounter) like before. 
Author: David Geier Author: Andres Freund Author: Lukas Fittl Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- doc/src/sgml/config.sgml | 54 ++++ src/backend/executor/instrument.c | 58 +++- src/backend/main/main.c | 7 +- src/backend/utils/misc/guc_parameters.dat | 11 + src/backend/utils/misc/guc_tables.c | 11 + src/backend/utils/misc/postgresql.conf.sample | 4 + src/bin/pg_test_timing/pg_test_timing.c | 14 +- src/bin/pgbench/pgbench.c | 2 +- src/bin/psql/startup.c | 2 +- src/common/instr_time.c | 293 +++++++++++++++++- src/include/port/pg_cpu.h | 9 + src/include/portability/instr_time.h | 157 ++++++++-- src/include/utils/guc_hooks.h | 3 + src/include/utils/guc_tables.h | 1 + src/port/pg_cpu_x86.c | 180 ++++++++++- src/tools/pgindent/typedefs.list | 1 + 16 files changed, 766 insertions(+), 41 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 8cdd826fbd3..99a6593d9ac 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2533,6 +2533,60 @@ include_dir 'conf.d' + + Timing + + + + timing_clock_source (enum) + + timing_clock_source configuration parameter + + + + + Selects the method for making timing measurements using the OS or specialized CPU + instructions. Possible values are: + + + + auto (automatically chooses TSC clock source for modern CPUs, + otherwise uses the OS system clock) + + + + + system (measures timing using the OS system clock) + + + + + tsc (measures timing using the x86-64 Time-Stamp Counter (TSC) + by directly executing RDTSC/RDTSCP instructions, see below) + + + + The default is auto. + + + If enabled, the TSC clock source will use the RDTSC instruction for the x86-64 + Time-Stamp Counter (TSC) to perform certain time measurements, for example during + EXPLAIN ANALYZE. 
The RDTSC instruction has less overhead than going through the OS + clock source, which for an EXPLAIN ANALYZE statement will show timing closer to the + actual runtime when timing is off. For timings that require higher precision the + RDTSCP instruction is used, which avoids inaccuracies due to CPU instruction re-ordering. + Use of RDTSC/RDTSCP is not supported on older CPUs or hypervisors that don't pass the TSC + frequency to guest VMs, and is not advised on systems that utilize an emulated TSC. + + + To help decide which clock source to use on an x86-64 system you can run the + pg_test_timing utility to check TSC availability, and + perform timing measurements. + + + + + Background Writer diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index a40610bc252..03cc82182ee 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -72,7 +72,7 @@ InstrStartNode(Instrumentation *instr) if (!INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStartNode called twice in a row"); else - INSTR_TIME_SET_CURRENT(instr->starttime); + INSTR_TIME_SET_CURRENT_FAST(instr->starttime); } /* save buffer usage totals at node entry, if needed */ @@ -99,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples) if (INSTR_TIME_IS_ZERO(instr->starttime)) elog(ERROR, "InstrStopNode called without start"); - INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_SET_CURRENT_FAST(endtime); INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); INSTR_TIME_SET_ZERO(instr->starttime); @@ -294,3 +294,57 @@ WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub) dst->wal_fpi_bytes += add->wal_fpi_bytes - sub->wal_fpi_bytes; dst->wal_buffers_full += add->wal_buffers_full - sub->wal_buffers_full; } + +/* GUC hooks for timing_clock_source */ + +#include "portability/instr_time.h" +#include "utils/guc_hooks.h" + +bool +check_timing_clock_source(int *newval, void **extra, GucSource source) +{ + 
pg_initialize_timing(true); + +#if PG_INSTR_TSC_CLOCK + if (*newval == TIMING_CLOCK_SOURCE_TSC && !has_usable_tsc) + { + GUC_check_errdetail("TSC is not supported as timing clock source"); + return false; + } +#endif + + return true; +} + +void +assign_timing_clock_source(int newval, void *extra) +{ + /* + * Ignore the return code since the check hook already verified TSC is + * usable if it's explicitly requested + */ + pg_set_timing_clock_source(newval); +} + +const char * +show_timing_clock_source(void) +{ + switch (timing_clock_source) + { + case TIMING_CLOCK_SOURCE_AUTO: +#if PG_INSTR_TSC_CLOCK + if (pg_current_timing_clock_source() == TIMING_CLOCK_SOURCE_TSC) + return "auto (tsc)"; +#endif + return "auto (system)"; + case TIMING_CLOCK_SOURCE_SYSTEM: + return "system"; +#if PG_INSTR_TSC_CLOCK + case TIMING_CLOCK_SOURCE_TSC: + return "tsc"; +#endif + } + + /* unreachable */ + return "?"; +} diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 884fb7b4910..bcb45a54678 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -93,7 +93,12 @@ main(int argc, char *argv[]) /* * Initialize timing infrastructure */ - pg_initialize_timing(); +#if defined(WIN32) + /* Skip TSC calibration on Windows, it's too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif /* * Remember the physical location of the initially given argv[] array for diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index a5a0edf2534..18cd9a0fafd 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2988,6 +2988,17 @@ assign_hook => 'assign_timezone_abbreviations', }, +{ name => 'timing_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME', + short_desc => 'Controls the clock source used for collecting timing measurements.', + long_desc => 'This enables the use of specialized clock sources, 
specifically the RDTSC clock source on x86-64 systems (if available), to support timing measurements with lower overhead during EXPLAIN and other instrumentation.', + variable => 'timing_clock_source', + boot_val => 'TIMING_CLOCK_SOURCE_AUTO', + options => 'timing_clock_source_options', + check_hook => 'check_timing_clock_source', + assign_hook => 'assign_timing_clock_source', + show_hook => 'show_timing_clock_source', +}, + { name => 'trace_connection_negotiation', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS', short_desc => 'Logs details of pre-authentication connection handshake.', flags => 'GUC_NOT_IN_SAMPLE', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 38aaf82f120..b8bb9590d9c 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -92,6 +92,7 @@ #include "tcop/tcopprot.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" +#include "portability/instr_time.h" #include "utils/bytea.h" #include "utils/float.h" #include "utils/guc_hooks.h" @@ -373,6 +374,15 @@ static const struct config_enum_entry huge_pages_options[] = { {NULL, 0, false} }; +static const struct config_enum_entry timing_clock_source_options[] = { + {"auto", TIMING_CLOCK_SOURCE_AUTO, false}, + {"system", TIMING_CLOCK_SOURCE_SYSTEM, false}, +#if PG_INSTR_TSC_CLOCK + {"tsc", TIMING_CLOCK_SOURCE_TSC, false}, +#endif + {NULL, 0, false} +}; + static const struct config_enum_entry huge_pages_status_options[] = { {"off", HUGE_PAGES_OFF, false}, {"on", HUGE_PAGES_ON, false}, @@ -723,6 +733,7 @@ const char *const config_group_names[] = [CONN_AUTH_TCP] = gettext_noop("Connections and Authentication / TCP Settings"), [CONN_AUTH_AUTH] = gettext_noop("Connections and Authentication / Authentication"), [CONN_AUTH_SSL] = gettext_noop("Connections and Authentication / SSL"), + [RESOURCES_TIME] = gettext_noop("Resource Usage / Time"), [RESOURCES_MEM] = gettext_noop("Resource Usage / Memory"), 
[RESOURCES_DISK] = gettext_noop("Resource Usage / Disk"), [RESOURCES_KERNEL] = gettext_noop("Resource Usage / Kernel Resources"), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e686d88afc4..3cbe96b96ed 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -193,6 +193,10 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Time - + +#timing_clock_source = auto # auto, system, tsc (if supported) + # - Background Writer - #bgwriter_delay = 200ms # 10-10000ms between rounds diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index 98672ae5d32..9f4b196c4bb 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -43,8 +43,18 @@ main(int argc, char *argv[]) handle_args(argc, argv); - /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + /* + * Initialize timing infrastructure (required for INSTR_* calls) + * + * This initialization should match the one in main() so the user can + * reason about what the backend will do. + */ +#if defined(WIN32) + /* Skip TSC calibration on Windows, it's too expensive per connection */ + pg_initialize_timing(false); +#else + pg_initialize_timing(true); +#endif loop_count = test_timing(test_duration); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index f962402a191..06db4042e8f 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7335,7 +7335,7 @@ main(int argc, char **argv) } /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + pg_initialize_timing(false); /* opening connection... 
*/ con = doConnect(); diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index 69d044d405d..83753dab7d3 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -329,7 +329,7 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); /* initialize timing infrastructure (required for INSTR_* calls) */ - pg_initialize_timing(); + pg_initialize_timing(false); SyncVariables(); diff --git a/src/common/instr_time.c b/src/common/instr_time.c index 68bc585f2cc..2becf9b0780 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -14,14 +14,21 @@ */ #include "postgres.h" +#include + +#ifndef WIN32 +#include +#endif + +#include "port/pg_cpu.h" #include "portability/instr_time.h" /* * Stores what the number of ticks needs to be multiplied with to end up * with nanoseconds using integer math. * - * On certain platforms (currently Windows) the ticks to nanoseconds conversion - * requires floating point math because: + * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows) + * the ticks to nanoseconds conversion requires floating point math because: * * sec = ticks / frequency_hz * ns = ticks / frequency_hz * 1,000,000,000 @@ -40,7 +47,7 @@ * We remember the maximum number of ticks that can be multiplied by the scale * factor without overflowing so we can check via a * b > max <=> a > max / b. * - * On all other platforms we are using clock_gettime(), which uses nanoseconds + * In all other cases we are using clock_gettime(), which uses nanoseconds * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns * to return the original value. */ @@ -49,22 +56,73 @@ uint64 max_ticks_no_overflow = 0; static void set_ticks_per_ns(void); +int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; static bool timing_initialized = false; +#if PG_INSTR_TSC_CLOCK +/* Indicates if TSC instructions (RDTSC and RDTSCP) are usable. 
*/ +bool has_usable_tsc = false; + +static void tsc_initialize(bool allow_tsc_calibration); +static bool tsc_use_by_default(void); +static void set_ticks_per_ns_system(); +static void set_ticks_per_ns_for_tsc(void); +#endif + +/* + * Initializes timing infrastructure. Must be called before making any use + * of INSTR* macros. + * + * The allow_tsc_calibration argument sets whether the TSC logic (if available) + * is permitted to do calibration if it couldn't get the frequency from CPUID. + * + * Calibration may take up to TSC_CALIBRATION_MAX_NS and delays program start. + */ void -pg_initialize_timing(void) +pg_initialize_timing(bool allow_tsc_calibration) { if (timing_initialized) return; +#if PG_INSTR_TSC_CLOCK + tsc_initialize(allow_tsc_calibration); +#endif + set_ticks_per_ns(); timing_initialized = true; } +bool +pg_set_timing_clock_source(TimingClockSourceType source) +{ + Assert(timing_initialized); + +#if PG_INSTR_TSC_CLOCK + switch (source) + { + case TIMING_CLOCK_SOURCE_AUTO: + use_tsc = has_usable_tsc && tsc_use_by_default(); + break; + case TIMING_CLOCK_SOURCE_SYSTEM: + use_tsc = false; + break; + case TIMING_CLOCK_SOURCE_TSC: + if (!has_usable_tsc) /* Tell caller TSC is not usable */ + return false; + use_tsc = true; + break; + } +#endif + + set_ticks_per_ns(); + timing_clock_source = source; + return true; +} + #ifndef WIN32 static void -set_ticks_per_ns() +set_ticks_per_ns_system() { ticks_per_ns_scaled = 0; max_ticks_no_overflow = 0; @@ -83,10 +141,233 @@ GetTimerFrequency(void) } static void -set_ticks_per_ns() +set_ticks_per_ns_system() { ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency(); max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; } #endif /* WIN32 */ + +static void +set_ticks_per_ns() +{ +#if PG_INSTR_TSC_CLOCK + if (use_tsc) + set_ticks_per_ns_for_tsc(); + else + set_ticks_per_ns_system(); +#else + set_ticks_per_ns_system(); +#endif +} + +/* TSC specific logic */ + +#if PG_INSTR_TSC_CLOCK + +bool 
use_tsc = false; + +static uint32 tsc_frequency_khz = 0; + +static uint32 tsc_calibrate(void); + +/* + * Decide whether we use the RDTSC/RDTSCP instructions at runtime, for x86-64, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + */ +static void +tsc_initialize(bool allow_tsc_calibration) +{ + /* Determine speed at which the TSC advances */ + tsc_frequency_khz = x86_tsc_frequency_khz(); + + if (tsc_frequency_khz) + { + has_usable_tsc = x86_feature_available(PG_RDTSCP); + return; + } + + /* + * CPUID did not give us the TSC frequency. If TSC is invariant and RDTSCP + * is available, we can measure the frequency by comparing TSC ticks + * against walltime using a short calibration loop. + */ + if (allow_tsc_calibration && x86_feature_available(PG_TSC_INVARIANT) && + x86_feature_available(PG_RDTSCP)) + { + tsc_frequency_khz = tsc_calibrate(); + has_usable_tsc = (tsc_frequency_khz > 0); + } +} + +/* + * Decides whether to use the TSC clock source if the user did not specify it + * one way or the other, and it is available (checked separately). + * + * Mirrors the Linux kernel's clocksource watchdog disable logic as updated in + * 2021 to reflect the reliability of the TSC on Intel platforms, see + * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion + * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/ + * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/ + * for reference. 
+ * + * When the CPU has an invariant TSC (which we require in x86_tsc_frequency_khz), + * TSC_ADJUST bit set (Intel-only), and the system has at most 4 physical + * packages (sockets), we consider the TSC trustworthy by default, matching the + * Linux kernel. + * + * On other CPU platforms (e.g. AMD), in a virtual machine, or on 8+ socket + * systems we don't have an easy way to determine the TSC's reliability. If on + * Linux, we can check if TSC is the active clocksource, based on it having run + * the watchdog logic to monitor TSC correctness. For other platforms the user + * must explicitly enable it via GUC instead. + */ +static bool +tsc_use_by_default(void) +{ + if (x86_feature_available(PG_TSC_ADJUST)) + { + int cpus_per_package = x86_logical_processors_per_package(); + long total_cpus; + +#ifdef _SC_NPROCESSORS_CONF + total_cpus = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(WIN32) + { + SYSTEM_INFO si; + + GetSystemInfo(&si); + total_cpus = si.dwNumberOfProcessors; + } +#else + total_cpus = -1; +#endif /* _SC_NPROCESSORS_CONF / WIN32 */ + + if (total_cpus > 0 && cpus_per_package > 0 && (total_cpus / cpus_per_package) <= 4) + return true; + } + +#if defined(__linux__) + { + FILE *fp; + char buf[128]; + + fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + if (fp) + { + bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL && + strcmp(buf, "tsc\n") == 0); + + fclose(fp); + if (is_tsc) + return true; + } + } +#endif + + return false; +} + +/* + * Calibrate the TSC frequency by comparing TSC ticks against walltime. + * + * Takes initial TSC and system clock snapshots, then loops, recomputing the + * frequency each iteration from cumulative TSC ticks divided by elapsed time. + * + * Once the frequency estimate stabilizes (consecutive iterations agree), we + * consider it converged and the frequency in KHz is returned. If either too + * many iterations or a time limit passes without convergence, 0 is returned. 
+ */ +#define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS) +#define TSC_CALIBRATION_ITERATIONS 1000000 +#define TSC_CALIBRATION_STABLE_CYCLES 3 + +static uint32 +tsc_calibrate(void) +{ + instr_time initial_wall; + int64 initial_tsc; + double freq_khz = 0; + double prev_freq_khz = 0; + int stable_count = 0; + int64 prev_tsc; + uint32 unused; + + /* Ensure INSTR_* time below work on system time */ + set_ticks_per_ns_system(); + + INSTR_TIME_SET_CURRENT(initial_wall); + +#ifdef _MSC_VER + initial_tsc = __rdtscp(&unused); +#else + initial_tsc = __builtin_ia32_rdtscp(&unused); +#endif + prev_tsc = initial_tsc; + + for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++) + { + instr_time now_wall; + int64 now_tsc; + int64 elapsed_ns; + int64 elapsed_ticks; + + INSTR_TIME_SET_CURRENT(now_wall); + +#ifdef _MSC_VER + now_tsc = __rdtscp(&unused); +#else + now_tsc = __builtin_ia32_rdtscp(&unused); +#endif + + INSTR_TIME_SUBTRACT(now_wall, initial_wall); + elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall); + + /* Safety: bail out if we've taken too long */ + if (elapsed_ns >= TSC_CALIBRATION_MAX_NS) + break; + + elapsed_ticks = now_tsc - initial_tsc; + + /* Skip if TSC hasn't advanced, or we walked backwards for some reason */ + if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0) + continue; + + freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000; + + /* + * Once the relative change between freq_khz and prev_freq_khz is below 0.01% (in either direction), check if it stays that way. + * If it does for long enough, we've got a winner frequency. 
+ */ + if (prev_freq_khz != 0 && fabs(freq_khz / prev_freq_khz - 1.0) < 0.0001) + { + stable_count++; + if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES) + return (uint32) freq_khz; + } + else + stable_count = 0; + + prev_tsc = now_tsc; + prev_freq_khz = freq_khz; + } + + /* did not converge */ + return 0; +} + +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac..a32e67487f8 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -23,6 +23,12 @@ typedef enum X86FeatureId /* scalar registers and 128-bit XMM registers */ PG_SSE4_2, PG_POPCNT, + PG_HYPERVISOR, + + /* TSC flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, /* 512-bit ZMM registers */ PG_AVX512_BW, @@ -45,6 +51,9 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern int x86_logical_processors_per_package(void); +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index eadf76720d9..a7640d0e72a 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,10 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in + * certain cases, or alternatively clock_gettime() on Unix-like systems and + * QueryPerformanceCounter() on Windows. 
These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -19,7 +20,11 @@ * * INSTR_TIME_SET_NANOSEC(t, x) set t to the specified value (in nanosecs) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window + * + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_ADD(x, y) x += y * @@ -82,12 +87,6 @@ typedef struct instr_time /* Shift amount for fixed-point ticks-to-nanoseconds conversion. */ #define TICKS_TO_NS_SHIFT 14 -#ifdef WIN32 -#define PG_INSTR_TICKS_TO_NS 1 -#else -#define PG_INSTR_TICKS_TO_NS 0 -#endif - /* * Variables used to translate ticks to nanoseconds, initialized by * pg_initialize_timing. @@ -95,12 +94,68 @@ typedef struct instr_time extern PGDLLIMPORT uint64 ticks_per_ns_scaled; extern PGDLLIMPORT uint64 max_ticks_no_overflow; +typedef enum +{ + TIMING_CLOCK_SOURCE_AUTO, + TIMING_CLOCK_SOURCE_SYSTEM, + TIMING_CLOCK_SOURCE_TSC +} TimingClockSourceType; + +extern int timing_clock_source; + /* * Initialize timing infrastructure * - * This must be called at least once before using INSTR_TIME_SET_CURRENT* macros. + * This must be called at least once by frontend programs before using + * INSTR_TIME_SET_CURRENT* macros. Backend programs automatically initialize + * this through the GUC check hook. + */ +extern void pg_initialize_timing(bool allow_tsc_calibrate); + +/* + * Sets the time source to be used. Mainly intended for frontend programs, + * the backend should set it via the timing_clock_source GUC instead. + * + * Returns false if the clock source could not be set, for example when TSC + * is not available despite being explicitly set. 
+ */ +extern bool pg_set_timing_clock_source(TimingClockSourceType source); + +#if defined(__x86_64__) || defined(_M_X64) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TICKS_TO_NS 1 +#elif defined(WIN32) +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 1 +#else +#define PG_INSTR_TSC_CLOCK 0 +#define PG_INSTR_TICKS_TO_NS 0 +#endif + + +#if PG_INSTR_TSC_CLOCK +/* Whether the hardware TSC clock is available and usable. */ +extern PGDLLIMPORT bool has_usable_tsc; + +/* Whether to actually use TSC based on availability and GUC settings. */ +extern PGDLLIMPORT bool use_tsc; + +#endif /* PG_INSTR_TSC_CLOCK */ + +/* + * Returns the current timing clock source effectively in use, resolving + * TIMING_CLOCK_SOURCE_AUTO to either TIMING_CLOCK_SOURCE_SYSTEM or + * TIMING_CLOCK_SOURCE_TSC. */ -extern void pg_initialize_timing(void); +static inline TimingClockSourceType +pg_current_timing_clock_source(void) +{ +#if PG_INSTR_TSC_CLOCK + return use_tsc ? TIMING_CLOCK_SOURCE_TSC : TIMING_CLOCK_SOURCE_SYSTEM; +#else + return TIMING_CLOCK_SOURCE_SYSTEM; +#endif +} #ifndef WIN32 @@ -119,22 +174,25 @@ extern void pg_initialize_timing(void); * than CLOCK_MONOTONIC. In particular, as of macOS 10.12, Apple provides * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than * their version of CLOCK_MONOTONIC. + * + * Note this does not get used in case the TSC clock source logic is used, + * which directly calls architecture specific timing instructions (e.g. RDTSC). 
*/ #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC_RAW +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC_RAW #elif defined(CLOCK_MONOTONIC) -#define PG_INSTR_CLOCK CLOCK_MONOTONIC +#define PG_INSTR_SYSTEM_CLOCK CLOCK_MONOTONIC #else -#define PG_INSTR_CLOCK CLOCK_REALTIME +#define PG_INSTR_SYSTEM_CLOCK CLOCK_REALTIME #endif static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; struct timespec tmp; - clock_gettime(PG_INSTR_CLOCK, &tmp); + clock_gettime(PG_INSTR_SYSTEM_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; return now; @@ -145,7 +203,7 @@ pg_get_ticks(void) /* On Windows, use QueryPerformanceCounter() for system clock source */ static inline instr_time -pg_get_ticks(void) +pg_get_ticks_system(void) { instr_time now; LARGE_INTEGER tmp; @@ -198,6 +256,66 @@ pg_ticks_to_ns(int64 ticks) #endif /* PG_INSTR_TICKS_TO_NS */ } +#if PG_INSTR_TSC_CLOCK + +#ifdef _MSC_VER +#include +#endif /* defined(_MSC_VER) */ + +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + +#ifdef _MSC_VER + now.ticks = __rdtsc(); +#else + /* Avoid complex includes on clang/GCC that raise compile times */ + now.ticks = __builtin_ia32_rdtsc(); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + uint32 unused; + +#ifdef _MSC_VER + now.ticks = __rdtscp(&unused); +#else + now.ticks = __builtin_ia32_rdtscp(&unused); +#endif /* defined(_MSC_VER) */ + return now; + } + + return pg_get_ticks_system(); +} + +#else + +static inline instr_time +pg_get_ticks_fast(void) +{ + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + return pg_get_ticks_system(); +} + +#endif /* PG_INSTR_TSC_CLOCK */ + /* * Common macros */ @@ -208,6 +326,9 @@ pg_ticks_to_ns(int64 ticks) #define INSTR_TIME_SET_NANOSEC(t, n) 
((t).ticks = n) +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_get_ticks()) diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 9c90670d9b8..a396e746415 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -162,6 +162,9 @@ extern const char *show_timezone(void); extern bool check_timezone_abbreviations(char **newval, void **extra, GucSource source); extern void assign_timezone_abbreviations(const char *newval, void *extra); +extern void assign_timing_clock_source(int newval, void *extra); +extern bool check_timing_clock_source(int *newval, void **extra, GucSource source); +extern const char *show_timing_clock_source(void); extern bool check_transaction_buffers(int *newval, void **extra, GucSource source); extern bool check_transaction_deferrable(bool *newval, void **extra, GucSource source); extern bool check_transaction_isolation(int *newval, void **extra, GucSource source); diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 71a80161961..63440b8e36c 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -60,6 +60,7 @@ enum config_group CONN_AUTH_TCP, CONN_AUTH_AUTH, CONN_AUTH_SSL, + RESOURCES_TIME, RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 7b01c17750c..fc29212f38c 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -25,6 +25,11 @@ #endif /* defined(_MSC_VER) */ #endif +#ifdef __linux__ +#include +#include +#endif + #ifdef HAVE_XSAVE_INTRINSICS #include #endif @@ -100,22 +105,23 @@ pg_attribute_target("xsave") void set_x86_features(void) { - CPUIDResult r = {0}; + CPUIDResult r = {0}, r2 = {0}; pg_cpuid(0x01, &r); X86Features[PG_SSE4_2] = r.ecx >> 20 & 1; X86Features[PG_POPCNT] = r.ecx >> 23 & 1; + X86Features[PG_HYPERVISOR] = r.ecx >> 31 & 1; + + pg_cpuid_subleaf(0x07, 0, &r2); - /* All 
these features depend on OSXSAVE */ + X86Features[PG_TSC_ADJUST] = (r2.ebx & (1 << 1)) != 0; + + /* leaf 7 features that depend on OSXSAVE */ if (r.ecx & (1 << 27)) { uint32 xcr0_val = 0; - /* second cpuid call on leaf 7 to check extended AVX-512 support */ - - pg_cpuid_subleaf(0x07, 0, &r); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -125,15 +131,169 @@ set_x86_features(void) if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) { - X86Features[PG_AVX512_BW] = r.ebx >> 30 & 1; - X86Features[PG_AVX512_VL] = r.ebx >> 31 & 1; + X86Features[PG_AVX512_BW] = r2.ebx >> 30 & 1; + X86Features[PG_AVX512_VL] = r2.ebx >> 31 & 1; - X86Features[PG_AVX512_VPCLMULQDQ] = r.ecx >> 10 & 1; - X86Features[PG_AVX512_VPOPCNTDQ] = r.ecx >> 14 & 1; + X86Features[PG_AVX512_VPCLMULQDQ] = r2.ecx >> 10 & 1; + X86Features[PG_AVX512_VPOPCNTDQ] = r2.ecx >> 14 & 1; } } + /* Check for other TSC related flags */ + pg_cpuid(0x80000001, &r); + X86Features[PG_RDTSCP] = r.edx >> 27 & 1; + + pg_cpuid(0x80000007, &r); + X86Features[PG_TSC_INVARIANT] = r.edx >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* + * Return the number of logical processors per physical CPU package (socket). + * + * This uses CPUID.0B (Extended Topology Enumeration) to enumerate topology + * levels. Each sub-leaf reports a level type in ECX[15:8] (1 = SMT, 2 = Core) + * and the number of logical processors at that level and below in EBX[15:0]. + * The value at the highest level gives us logical processors per package. + * + * Vendor-specific leaves (0x1F for Intel, 0x80000026 for AMD) provide + * finer-grained sub-package topology but are assumed to report the same + * per-package totals on current hardware. + * + * Returns 0 if topology information is not available. 
+ */ +int +x86_logical_processors_per_package(void) +{ + int logical_per_package = 0; + + for (int subleaf = 0; subleaf < 8; subleaf++) + { + CPUIDResult r = {0}; + uint32 level_type; + + if (!pg_cpuid_subleaf(0x0B, subleaf, &r)) + return 0; + + level_type = (r.ecx >> 8) & 0xff; + + /* level_type 0 means end of enumeration */ + if (level_type == 0) + break; + + logical_per_package = r.ebx & 0xffff; + } + + return logical_per_package; +} + +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates TSC is not invariant, or the frequency information was not + * accessible and the instructions should not be used. + */ +uint32 +x86_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + + if (!x86_feature_available(PG_TSC_INVARIANT)) + return 0; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. + * + * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer’s Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see "Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. 
+ */ + pg_cpuid(0x15, &r); + if (r.ecx > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX being non-zero too, to avoid a theoretical divide by zero. + */ + if (r.eax == 0 || r.ebx == 0) + return 0; + + return r.ecx / 1000 * r.ebx / r.eax; + } + + /* + * When CPUID.15H is not available/incomplete, but we have verified an + * invariant TSC is used, we can instead get the processor base frequency + * in MHz from CPUID.16H:EAX, the "Processor Frequency Information Leaf". + */ + pg_cpuid(0x16, &r); + if (r.eax > 0) + return r.eax * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in kHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access an MSR to get the frequency (which is typically not available + * for unprivileged processes), so we instead rely on the TSC calibration logic. + */ +#define CPUID_HYPERVISOR_VMWARE(r) (r.ebx == 0x61774d56 && r.ecx == 0x4d566572 && r.edx == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) (r.ebx == 0x4b4d564b && r.ecx == 0x564b4d56 && r.edx == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + CPUIDResult r = {0}; + +/* + * The hypervisor is determined using the 0x40000000 Hypervisor information + * leaf, which requires use of __cpuidex to set ECX to 0 to access it. + * + * The similar __get_cpuid_count function does not work as expected since it + * contains a check for __get_cpuid_max, which has been observed to be lower + * than the special Hypervisor leaf, despite it being available. 
+ */ +#if defined(HAVE__CPUIDEX) + __cpuidex((int *) &r, 0x40000000, 0); + + if (r.eax >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + __cpuidex((int *) &r, 0x40000010, 0); + if (r.eax > 0) + return r.eax; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 489defe7362..a3b76886caa 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3118,6 +3118,7 @@ TimeoutId TimeoutType Timestamp TimestampTz +TimingClockSourceType TmFromChar TmToChar ToastAttrInfo -- 2.47.1