From 3e1a8c9accd958187a5ada117448f486261cb415 Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Tue, 10 Mar 2026 01:38:14 -0700 Subject: [PATCH v21 5/5] instrumentation: ARM support for fast time measurements Similar to the RDTSC/RDTSCP instructions on x86-64, this introduces use of the cntvct_el0 register on ARM systems to access the generic timer that provides a synchronized ticks value across CPUs. Note this adds an exception for Apple Silicon CPUs, due to the observed fact that M3 and newer have different timer frequencies for the Efficiency and the Performance cores, and we can't be sure where we get scheduled. To simplify the implementation this does not support Windows on ARM, since it's quite rare and hard to test. Relies on the existing timing_clock_source GUC to control whether the TSC-like timer gets used instead of the system timer. Author: Lukas Fittl Reviewed-by: Discussion: --- src/common/instr_time.c | 67 +++++++++++++++++++++++++++- src/include/port/pg_cpu.h | 6 +++ src/include/portability/instr_time.h | 56 +++++++++++++++++++++-- src/port/meson.build | 1 + src/port/pg_cpu_arm.c | 45 +++++++++++++++++++ 5 files changed, 170 insertions(+), 5 deletions(-) create mode 100644 src/port/pg_cpu_arm.c diff --git a/src/common/instr_time.c b/src/common/instr_time.c index f0c5d73251e..14ab4579d37 100644 --- a/src/common/instr_time.c +++ b/src/common/instr_time.c @@ -20,6 +20,10 @@ #include +#if defined(__APPLE__) +#include +#endif + #include "port/pg_cpu.h" #include "portability/instr_time.h" @@ -161,7 +165,7 @@ set_ticks_per_ns_system(void) #endif /* WIN32 */ -/* TSC specific logic */ +/* Hardware clock specific logic (x86 TSC / AArch64 CNTVCT) */ #if PG_INSTR_TSC_CLOCK @@ -189,6 +193,12 @@ set_ticks_per_ns_for_tsc(void) max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; } +#if defined(__x86_64__) || defined(_M_X64) + +/* + * x86-64 TSC specific logic + */ + /* * Detect the TSC frequency and whether RDTSCP is available on x86-64. 
* @@ -370,4 +380,59 @@ pg_tsc_calibrate_frequency(void) return (uint32) freq_khz; } +#elif defined(__aarch64__) + +/* + * Check whether this is a heterogeneous Apple Silicon P+E core system + * where CNTVCT_EL0 may tick at different rates on different core types. + */ +static bool +aarch64_has_heterogeneous_cores(void) +{ +#if defined(__APPLE__) + int nperflevels = 0; + size_t len = sizeof(nperflevels); + + if (sysctlbyname("hw.nperflevels", &nperflevels, &len, NULL, 0) == 0) + return nperflevels > 1; +#endif + + return false; +} + +/* + * Detect the generic timer frequency on AArch64. + */ +static void +tsc_detect_frequency(void) +{ + if (aarch64_has_heterogeneous_cores()) + { + timing_tsc_frequency_khz = 0; + return; + } + + timing_tsc_frequency_khz = aarch64_cntvct_frequency_khz(); +} + +/* + * The ARM generic timer is architecturally guaranteed to be monotonic and + * synchronized across cores of the same type, so we always use it by default + * when available and cores are homogeneous. 
+ */ +static bool +tsc_use_by_default(void) +{ + return true; +} + +uint32 +pg_tsc_calibrate_frequency(void) +{ + /* No calibration loop on AArch64; frequency comes from CNTFRQ_EL0 */ + return 0; +} + +#endif /* defined(__aarch64__) */ + #endif /* PG_INSTR_TSC_CLOCK */ diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index a5d42f1b68d..aee501a4ecd 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -60,4 +60,10 @@ extern uint32 x86_tsc_frequency_khz(void); #endif /* defined(USE_SSE2) || defined(__i386__) */ +#if defined(__aarch64__) + +extern uint32 aarch64_cntvct_frequency_khz(void); + +#endif /* defined(__aarch64__) */ + #endif /* PG_CPU_H */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index bd81bb68b4c..dfebdfbf461 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,8 +4,9 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in - * certain cases, or alternatively clock_gettime() on Unix-like systems and + * interval timing. On x86 we use the RDTSC/RDTSCP instruction, and on + * AArch64 the CNTVCT_EL0 generic timer, directly in certain cases, or + * alternatively clock_gettime() on Unix-like systems and * QueryPerformanceCounter() on Windows. These macros also give some breathing * room to use other high-precision-timing APIs. * @@ -95,7 +96,7 @@ typedef struct instr_time * PG_INSTR_TSC_CLOCK controls whether the TSC clock source is compiled in, and * potentially used based on timing_tsc_enabled. 
*/ -#if defined(__x86_64__) || defined(_M_X64) +#if defined(__x86_64__) || defined(_M_X64) || (defined(__aarch64__) && !defined(_MSC_VER)) #define PG_INSTR_TICKS_TO_NS 1 #define PG_INSTR_TSC_CLOCK 1 #elif defined(WIN32) @@ -333,6 +334,8 @@ pg_ns_to_ticks(int64 ns) #if PG_INSTR_TSC_CLOCK +#if defined(__x86_64__) || defined(_M_X64) + #define PG_INSTR_TSC_CLOCK_NAME_FAST "RDTSC" #define PG_INSTR_TSC_CLOCK_NAME "RDTSCP" @@ -391,7 +394,52 @@ pg_get_ticks(void) return pg_get_ticks_system(); } -#else +#elif defined(__aarch64__) && !defined(_MSC_VER) + +#define PG_INSTR_TSC_CLOCK_NAME_FAST "CNTVCT_EL0" +#define PG_INSTR_TSC_CLOCK_NAME "CNTVCT_EL0 (ISB)" + +/* + * Read the ARM generic timer counter (CNTVCT_EL0). + * + * The "fast" variant reads the counter without a barrier, analogous to RDTSC + * on x86. The regular variant issues an ISB (Instruction Synchronization + * Barrier) first, which acts as a serializing instruction analogous to RDTSCP, + * ensuring all preceding instructions have completed before reading the + * counter. 
+ */ +static pg_attribute_always_inline instr_time +pg_get_ticks_fast(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +static pg_attribute_always_inline instr_time +pg_get_ticks(void) +{ + if (likely(timing_tsc_enabled)) + { + instr_time now; + + __builtin_arm_isb(0xf); + now.ticks = __builtin_arm_rsr64("cntvct_el0"); + return now; + } + + return pg_get_ticks_system(); +} + +#endif /* defined(__aarch64__) */ + +#else /* !PG_INSTR_TSC_CLOCK */ static pg_attribute_always_inline instr_time pg_get_ticks_fast(void) diff --git a/src/port/meson.build b/src/port/meson.build index 922b3f64676..d695f92b769 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_cpu_arm.c', 'pg_cpu_x86.c', 'pg_getopt_ctx.c', 'pg_localeconv_r.c', diff --git a/src/port/pg_cpu_arm.c b/src/port/pg_cpu_arm.c new file mode 100644 index 00000000000..2814a947706 --- /dev/null +++ b/src/port/pg_cpu_arm.c @@ -0,0 +1,45 @@ +/*------------------------------------------------------------------------- + * + * pg_cpu_arm.c + * Runtime CPU feature detection for AArch64 + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_cpu_arm.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#if defined(__aarch64__) && !defined(_MSC_VER) + +#include "port/pg_cpu.h" + +/* + * Return the frequency of the ARM generic timer (CNTVCT_EL0) in kHz. + * + * The CNTFRQ_EL0 system register is architecturally guaranteed to be readable + * from EL0 (userspace) and holds the timer frequency in Hz. The firmware sets + * this at boot and it does not change. 
+ * + * Returns 0 if the frequency is not available (should not happen on conforming + * implementations). + */ +uint32 +aarch64_cntvct_frequency_khz(void) +{ + uint64 freq; + + freq = __builtin_arm_rsr64("cntfrq_el0"); + + if (freq == 0) + return 0; + + return (uint32) (freq / 1000); +} + +#endif /* defined(__aarch64__) */ -- 2.47.1