From cebe47ad741b437144b0bbffb6923ef15597a445 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 01:38:14 -0700 Subject: [PATCH v16 5/5] instrumentation: ARM support for fast time measurements Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the cntvct_el0 instruction on ARM systems to access the generic timer that provides a synchronized ticks value across CPUs. Note this adds an exception for Apple Silicon CPUs, due to the observed fact that M3 and newer have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure where we get scheduled. To simplify the implementation this does not support Windows on ARM, since it's quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether TSC-like timer gets used, instead of system timer. Author: Lukas Fittl Reviewed-by: Discussion: --- src/common/instr_time.c | 63 +++++++++++++++++++++++++++- src/include/port/pg_cpu.h | 6 +++ src/include/portability/instr_time.h | 57 +++++++++++++++++++++++-- src/port/meson.build | 1 + src/port/pg_cpu_arm.c | 45 ++++++++++++++++++++ 5 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 src/port/pg_cpu_arm.c diff --git a/src/common/instr_time.c b/src/common/instr_time.c index 2051f423415..8fcf49023bd 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -16,6 +16,10 @@ #include +#if defined(__APPLE__) +#include +#endif + #include "port/pg_cpu.h" #include "portability/instr_time.h" @@ -159,7 +163,7 @@ set_ticks_per_ns_system(void) #endif /* WIN32 */ -/* TSC specific logic */ +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ #if PG_INSTR_TSC_CLOCK @@ -168,7 +172,6 @@ bool timing_tsc_enabled = false; int32 timing_tsc_frequency_khz = -1; static void tsc_detect_frequency(void); -static uint32 tsc_calibrate(void); /* * Initialize the TSC clock source by determining its usability and frequency. 
@@ -197,6 +200,14 @@ set_ticks_per_ns_for_tsc(void) max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; } +#if defined(__x86_64__) || defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + +static uint32 tsc_calibrate(void); + /* * Detect the TSC frequency and whether RDTSCP is available on x86-64. * @@ -359,4 +370,52 @@ tsc_calibrate(void) return 0; } +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Detect the generic timer frequency on AArch64. + */ +static void +tsc_detect_frequency(void) +{ + if (aarch64_has_heterogeneous_cores()) + { + timing_tsc_frequency_khz = 0; + return; + } + + timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz(); +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogeneous. 
+ */ +static bool +tsc_use_by_default(void) +{ + return true; +} + +#endif /* defined(__aarch64__) */ + #endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index 0e1fea7fa92..89cceba256f 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -55,4 +55,10 @@ extern uint32 x86_tsc_frequency_khz(void); #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 2e7d7979e10..3c9815a3a25 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,8 +4,9 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in - * certain cases, or alternatively clock_gettime() on Unix-like systems and + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and * QueryPerformanceCounter() on Windows. These macros also give some breathing * room to use other high-precision-timing APIs. 
* @@ -136,6 +137,11 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" #define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" #define PG_INSTR_TICKS_TO_NS 1 +#elif defined(__aarch64__) && !defined(WIN32) +#define PG_INSTR_TSC_CLOCK 1 +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" +#define PG_INSTR_TICKS_TO_NS 1 #elif defined(WIN32) #define PG_INSTR_TSC_CLOCK 0 #define PG_INSTR_TICKS_TO_NS 1 @@ -144,7 +150,6 @@ extern bool pg_set_timing_clock_source(TimingClockSourceType source); #define PG_INSTR_TICKS_TO_NS 0 #endif - #if PG_INSTR_TSC_CLOCK /* Whether to actually use TSC based on availability and GUC settings. */ extern PGDLLIMPORT bool timing_tsc_enabled; @@ -324,6 +329,8 @@ pg_ns_to_ticks(int64 ns) #if PG_INSTR_TSC_CLOCK +#if defined(__x86_64__) || defined(_M_X64) + #ifdef _MSC_VER #include #endif /* defined(_MSC_VER) */ @@ -379,7 +386,49 @@ pg_get_ticks(void) return pg_get_ticks_system(); } -#else +#elif defined(__aarch64__) && !defined(WIN32) + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. 
+ */ +static inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ static inline instr_time pg_get_ticks_fast(void) diff --git a/src/port/meson.build b/src/port/meson.build index d55cb0424f3..9553d94a2f4 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_getopt_ctx.c', 'pg_localeconv_r.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 00000000000..6fd9dd892ec --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(WIN32) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. + * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). 
+ */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ -- 2.47.1