From a9d6b939681445c855de73e6aea50ba9a2acbbc3 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Wed, 11 Mar 2026 00:55:03 -0700 Subject: [PATCH v16 2/5] Allow retrieving x86 TSC frequency/flags from CPUID This adds additional x86 specific CPUID checks for flags needed for determining whether the Time-Stamp Counter (TSC) is usable on a given system, as well as a helper function to retrieve the TSC frequency from CPUID. This is intended for a future patch that will utilize the TSC to lower the overhead of timing instrumentation. Author: Lukas Fittl Author: David Geier Author: Andres Freund Reviewed-by: Andres Freund Reviewed-by: David Geier Reviewed-by: John Naylor Reviewed-by: Jakub Wartak (in an earlier version) Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de --- src/include/port/pg_cpu.h | 8 +++ src/port/pg_cpu_x86.c | 124 +++++++++++++++++++++++++++++++++++++- 2 files changed, 129 insertions(+), 3 deletions(-) diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac..0e1fea7fa92 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -23,6 +23,12 @@ typedef enum X86FeatureId /* scalar registers and 128-bit XMM registers */ PG_SSE4_2, PG_POPCNT, + PG_HYPERVISOR, + + /* TSC flags */ + PG_RDTSCP, + PG_TSC_INVARIANT, + PG_TSC_ADJUST, /* 512-bit ZMM registers */ PG_AVX512_BW, @@ -45,6 +51,8 @@ x86_feature_available(X86FeatureId feature) return X86Features[feature]; } +extern uint32 x86_tsc_frequency_khz(void); + #endif /* defined(USE_SSE2) || defined(__i386__) */ #endif /* PG_CPU_H */ diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index 1331f3f4eb8..71a07b55bf3 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -101,19 +101,24 @@ void set_x86_features(void) { unsigned int reg[4] = {0}; + bool have_osxsave; pg_cpuid(0x01, reg); X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1; X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1; + 
X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1; + have_osxsave = (reg[ECX] & (1 << 27)) != 0; /* saved: reg is reused below */ + + pg_cpuid_subleaf(0x07, 0, reg); + + X86Features[PG_TSC_ADJUST] = (reg[EBX] & (1 << 1)) != 0; /* leaf 7 features that depend on OSXSAVE */ - if (reg[ECX] & (1 << 27)) + if (have_osxsave) { uint32 xcr0_val = 0; - pg_cpuid_subleaf(0x07, 0, reg); - #ifdef HAVE_XSAVE_INTRINSICS /* get value of Extended Control Register */ xcr0_val = _xgetbv(0); @@ -131,7 +136,120 @@ set_x86_features(void) } } + /* Other TSC related flags, from extended leaves 0x80000001 and 0x80000007 */ + pg_cpuid(0x80000001, reg); + X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1; + + pg_cpuid(0x80000007, reg); + X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1; + X86Features[INIT_PG_X86] = true; } +/* TSC (Time-stamp Counter) handling code */ + +static uint32 x86_hypervisor_tsc_frequency_khz(void); + +/* + * Determine the TSC frequency of the CPU through CPUID, where supported. + * + * Needed to interpret the tick value returned by RDTSC/RDTSCP. Return value of + * 0 indicates the frequency information was not accessible via CPUID. + */ +uint32 +x86_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + + if (x86_feature_available(PG_HYPERVISOR)) + return x86_hypervisor_tsc_frequency_khz(); + + /* + * On modern Intel CPUs, the TSC is implemented by invariant timekeeping + * hardware, also called "Always Running Timer", or ART. The ART stays + * consistent even if the CPU changes frequency due to changing power + * levels. 
+ * * As documented in "Determining the Processor Base Frequency" in the + * "Intel® 64 and IA-32 Architectures Software Developer's Manual", + * February 2026 Edition, we can get the TSC frequency as follows: + * + * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) / + * CPUID.15H:EAX[31:0] + * + * With CPUID.15H:ECX representing the nominal core crystal clock + * frequency, and EAX/EBX representing values used to translate the TSC + * value to that frequency, see Chapter 20.17 "Time-Stamp Counter" of + * that manual. + * + * Older Intel CPUs, and other vendors do not set CPUID.15H:ECX, and as + * such we fall back to alternate approaches. + */ + pg_cpuid(0x15, reg); + if (reg[ECX] > 0) + { + /* + * EBX not being set indicates invariant TSC is not available. Require + * EAX != 0 too (it is the divisor). Scale in 64 bits; Hz -> kHz last. + */ + if (reg[EAX] == 0 || reg[EBX] == 0) + return 0; + + return (uint32) ((uint64) reg[ECX] * reg[EBX] / reg[EAX] / 1000); + } + + /* + * When CPUID.15H is not available/incomplete, we can instead try to get + * the processor base frequency in MHz from CPUID.16H:EAX, the "Processor + * Frequency Information Leaf". + */ + pg_cpuid(0x16, reg); + if (reg[EAX] > 0) + return reg[EAX] * 1000; + + return 0; +} + +/* + * Support for reading TSC frequency for hypervisors passing it to a guest VM. + * + * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz + * available at the vendor-specific 0x40000010 leaf in the EAX register. + * + * For some other Hypervisors that have an invariant TSC, e.g. HyperV, we would + * need to access an MSR to get the frequency (which is typically not available + * for unprivileged processes), so we instead rely on the TSC calibration logic. 
+ */ +#define CPUID_HYPERVISOR_VMWARE(r) ((r)[EBX] == 0x61774d56 && (r)[ECX] == 0x4d566572 && (r)[EDX] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(r) ((r)[EBX] == 0x4b4d564b && (r)[ECX] == 0x564b4d56 && (r)[EDX] == 0x0000004d) /* KVMKVMKVM */ +static uint32 +x86_hypervisor_tsc_frequency_khz(void) +{ + unsigned int reg[4] = {0}; + +#if defined(HAVE__CPUIDEX) + + /* + * The hypervisor is determined using the 0x40000000 Hypervisor + * information leaf, which requires use of __cpuidex to set ECX to 0 to + * access it. + * + * The similar __get_cpuid_count function does not work as expected since + * it contains a check for __get_cpuid_max, which has been observed to be + * lower than the special Hypervisor leaf, despite it being available. + */ + __cpuidex((int *) reg, 0x40000000, 0); + + if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg))) + { + __cpuidex((int *) reg, 0x40000010, 0); + if (reg[EAX] > 0) + return reg[EAX]; + } +#endif /* HAVE__CPUIDEX */ + + return 0; +} + + #endif /* defined(USE_SSE2) || defined(__i386__) */ -- 2.47.1