From 6aa00e5ed2a5eb8714323f505efe87b5b0011393 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 01:38:14 -0700 Subject: [PATCH v11 7/7] instrumentation: ARM support for fast time measurements Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the CNTVCT_EL0 register on ARM systems to access the generic timer that provides a synchronized ticks value across CPUs. Note this adds an exception for Apple Silicon CPUs, due to the observed fact that M3 and newer have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure where we get scheduled. To simplify the implementation this does not support Windows on ARM, since it's quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether the TSC-like timer gets used instead of the system timer. Author: Lukas Fittl Reviewed-by: Discussion: --- src/common/instr_time.c | 65 ++++++++++++++++++++++++++-- src/include/port/pg_cpu.h | 6 +++ src/include/portability/instr_time.h | 57 ++++++++++++++++++++++-- src/port/meson.build | 1 + src/port/pg_cpu_arm.c | 45 +++++++++++++++++++ 5 files changed, 166 insertions(+), 8 deletions(-) create mode 100644 src/port/pg_cpu_arm.c diff --git a/src/common/instr_time.c b/src/common/instr_time.c index 2becf9b0780..7d74c058d7a 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -20,6 +20,10 @@ #include #endif +#if defined(__APPLE__) +#include <sys/sysctl.h> +#endif + #include "port/pg_cpu.h" #include "portability/instr_time.h" @@ -162,7 +166,7 @@ set_ticks_per_ns() #endif } -/* TSC specific logic */ +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ #if PG_INSTR_TSC_CLOCK @@ -170,6 +174,19 @@ bool use_tsc = false; static uint32 tsc_frequency_khz = 0; +static void +set_ticks_per_ns_for_tsc(void) +{ + ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; +} + +#if defined(__x86_64__) || 
defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + static uint32 tsc_calibrate(void); /* @@ -363,11 +380,51 @@ tsc_calibrate(void) return 0; } +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Initialize the AArch64 generic timer as a clock source. + */ static void -set_ticks_per_ns_for_tsc(void) +tsc_initialize(bool allow_tsc_calibration) { - ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / tsc_frequency_khz; - max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; + if (aarch64_has_heterogeneous_cores()) + return; + + tsc_frequency_khz = aarch64_cntvct_frequency_khz(); + if (tsc_frequency_khz != 0) + has_usable_tsc = true; +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogenous. 
+ */ +static bool +tsc_use_by_default(void) +{ + return true; } +#endif /* defined(__aarch64__) */ + #endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index a32e67487f8..82df66f381e 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -56,4 +56,10 @@ extern uint32 x86_tsc_frequency_khz(void); #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 681e3f5bf8d..ac8020bdd62 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,8 +4,9 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in - * certain cases, or alternatively clock_gettime() on Unix-like systems and + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and * QueryPerformanceCounter() on Windows. These macros also give some breathing * room to use other high-precision-timing APIs. 
* @@ -126,6 +127,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" #define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" #define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 #elif defined(WIN32) #define PG_INSTR_TSC_CLOCK 0 #define PG_INSTR_TICKS_TO_NS 1 @@ -134,7 +140,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TICKS_TO_NS 0 #endif - #if PG_INSTR_TSC_CLOCK /* Whether the hardware TSC clock is available and usable. */ extern PGDLLIMPORT bool has_usable_tsc; @@ -264,6 +269,8 @@ pg_ticks_to_ns(int64 ticks) #if PG_INSTR_TSC_CLOCK +#if defined(__x86_64__) || defined(_M_X64) + #ifdef _MSC_VER #include #endif /* defined(_MSC_VER) */ @@ -306,7 +313,49 @@ pg_get_ticks(void) return pg_get_ticks_system(); } -#else +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. 
+ */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(use_tsc)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ static inline instr_time pg_get_ticks_fast(void) diff --git a/src/port/meson.build b/src/port/meson.build index 7296f8e3c03..110bcd28edd 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_localeconv_r.c', 'pg_numa.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 00000000000..6fd9dd892ec --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). 
+ */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ -- 2.47.1