From c05a5b30df212248009a29df739b9d1d57ea9261 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Thu, 12 Feb 2026 12:45:23 +0700 Subject: [PATCH v1 2/4] Centralize detection of CPU features WIP: x86 only --- configure | 4 +- configure.ac | 4 +- src/include/port/pg_x86_feature.h | 44 +++++++++++++++ src/port/Makefile | 1 + src/port/meson.build | 3 +- src/port/pg_crc32c_sse42.c | 29 ++++++++++ src/port/pg_popcount_x86.c | 91 ++----------------------------- src/port/pg_x86_feature.c | 75 ++++++++++++------------- 8 files changed, 119 insertions(+), 132 deletions(-) create mode 100644 src/include/port/pg_x86_feature.h diff --git a/configure b/configure index 373194daa05..185703289b4 100755 --- a/configure +++ b/configure @@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -18204,7 +18204,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else diff --git a/configure.ac b/configure.ac index 62e47394544..0955b7e4371 100644 --- a/configure.ac +++ b/configure.ac @@ -2245,12 +2245,12 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime 
check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then diff --git a/src/include/port/pg_x86_feature.h b/src/include/port/pg_x86_feature.h new file mode 100644 index 00000000000..de56882c9e1 --- /dev/null +++ b/src/include/port/pg_x86_feature.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * pg_x86_feature.h + * Runtime CPU feature detection + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/pg_x86_feature.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_X86_FEATURE_H +#define PG_X86_FEATURE_H + + +typedef enum X86FeatureId +{ + init, + + PG_SSE4_2, + PG_POPCNT, + + PG_AVX512_BW, + PG_AVX512_VL, + PG_VPCLMULQDQ, + PG_AVX512_VPOPCNTDQ, +} X86FeatureId; +#define X86FeatureSize (PG_AVX512_VPOPCNTDQ + 1) + +extern PGDLLIMPORT bool X86Feature[]; + +extern void x86_set_runtime_features(void); + +static inline bool +x86_feature_available(X86FeatureId feature) +{ + if (X86Feature[init] == false) + x86_set_runtime_features(); + + return X86Feature[feature]; +} + +#endif /* PG_X86_FEATURE_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 6e3b7d154ed..7b5bc58a898 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -49,6 +49,7 @@ OBJS = \ pg_popcount_aarch64.o \ pg_popcount_x86.o \ pg_strong_random.o \ + pg_x86_feature.o \ pgcheckdir.o \ pgmkdirp.o \ pgsleep.o \ diff --git a/src/port/meson.build b/src/port/meson.build index d96b4eed4c6..9a25a634a5b 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -12,6 +12,7 @@ pgport_sources = [ 'pg_popcount_aarch64.c', 'pg_popcount_x86.c', 'pg_strong_random.c', + 'pg_x86_feature.c', 'pgcheckdir.c', 'pgmkdirp.c', 
'pgsleep.c', @@ -86,8 +87,6 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_x86_feature', 'USE_SSE42_CRC32C'], - ['pg_x86_feature', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index c1279d31fbd..f64ddde28cd 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -20,6 +20,10 @@ #endif #include "port/pg_crc32c.h" +#include "port/pg_x86_feature.h" + +static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len); + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") @@ -159,3 +163,28 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len) } #endif + +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. 
+ */ +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + + if (x86_feature_available(PG_SSE4_2)) + pg_comp_crc32c = pg_comp_crc32c_sse42; + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX512_VL) && + x86_feature_available(PG_VPCLMULQDQ)) + pg_comp_crc32c = pg_comp_crc32c_avx512; +#endif + + return pg_comp_crc32c(crc, data, len); +} + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 6bce089432f..45e8930adc7 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -14,19 +14,12 @@ #ifdef HAVE_X86_64_POPCNTQ -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include -#endif - #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK #include #endif -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) -#include -#endif - #include "port/pg_bitutils.h" +#include "port/pg_x86_feature.h" /* * The SSE4.2 versions are built regardless of whether we are building the @@ -58,84 +51,9 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; -/* - * Return true if CPUID indicates that the POPCNT instruction is available. - */ -static bool -pg_popcount_sse42_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK -/* - * Does CPUID say there's support for XSAVE instructions? 
- */ -static inline bool -xsave_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 27)) != 0; /* osxsave */ -} - -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that xsave_available() returns true - * before calling this. - */ -#ifdef HAVE_XSAVE_INTRINSICS -pg_attribute_target("xsave") -#endif -static inline bool -zmm_regs_available(void) -{ -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif -} - -/* - * Does CPUID say there's support for AVX-512 popcount and byte-and-word - * instructions? - */ -static inline bool -avx512_popcnt_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */ - (exx[1] & (1 << 30)) != 0; /* avx512-bw */ -} - /* * Returns true if the CPU supports the instructions required for the AVX-512 * pg_popcount() implementation. 
@@ -143,9 +61,8 @@ avx512_popcnt_available(void) static bool pg_popcount_avx512_available(void) { - return xsave_available() && - zmm_regs_available() && - avx512_popcnt_available(); + return x86_feature_available(PG_AVX512_BW) && + x86_feature_available(PG_AVX512_VPOPCNTDQ); } #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ @@ -159,7 +76,7 @@ pg_popcount_avx512_available(void) static inline void choose_popcount_functions(void) { - if (pg_popcount_sse42_available()) + if (x86_feature_available(PG_POPCNT)) { pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; diff --git a/src/port/pg_x86_feature.c b/src/port/pg_x86_feature.c index f586476964f..c92cfbe6d5d 100644 --- a/src/port/pg_x86_feature.c +++ b/src/port/pg_x86_feature.c @@ -1,25 +1,21 @@ /*------------------------------------------------------------------------- * - * pg_crc32c_sse42_choose.c - * Choose between Intel SSE 4.2 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports Intel SSE - * 4.2. If it does, use the special SSE instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). + * pg_x86_feature.c + * Runtime CPU feature detection * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/port/pg_crc32c_sse42_choose.c + * src/port/pg_x86_feature.c * *------------------------------------------------------------------------- */ - #include "c.h" +#if defined(USE_SSE2) || defined(__i386__) + #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include #endif @@ -32,7 +28,11 @@ #include #endif -#include "port/pg_crc32c.h" +#include "port/pg_x86_feature.h" + + +bool X86Feature[X86FeatureSize] = {0}; + /* * Does XGETBV say the ZMM registers are enabled? 
@@ -54,22 +54,13 @@ zmm_regs_available(void) } /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * Parse the CPU ID info for runtime checks. */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +void +x86_set_runtime_features(void) { unsigned int exx[4] = {0, 0, 0, 0}; - /* - * Set fallback. We must guard since slicing-by-8 is not visible - * everywhere. - */ -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK - pg_comp_crc32c = pg_comp_crc32c_sb8; -#endif - #if defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) @@ -78,32 +69,38 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) #error cpuid instruction not available #endif - if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ - { - pg_comp_crc32c = pg_comp_crc32c_sse42; + X86Feature[PG_SSE4_2] = exx[2] >> 20 & 1; + X86Feature[PG_POPCNT] = exx[2] >> 23 & 1; - if (exx[2] & (1 << 27) && /* OSXSAVE */ - zmm_regs_available()) - { - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + /* All these features depend on OSXSAVE */ + if (exx[2] & (1 << 27)) + { + /* second cpuid call on leaf 7 to check extended AVX-512 support */ - memset(exx, 0, 4 * sizeof(exx[0])); + memset(exx, 0, 4 * sizeof(exx[0])); #if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); + __cpuidex(exx, 7, 0); #endif -#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK - if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ - exx[1] & (1 << 31)) /* AVX512-VL */ - pg_comp_crc32c = pg_comp_crc32c_avx512; -#endif + if (zmm_regs_available()) + { + X86Feature[PG_AVX512_BW] = exx[1] >> 30 & 1; + X86Feature[PG_AVX512_VL] = exx[1] >> 31 & 1; + + X86Feature[PG_VPCLMULQDQ] = exx[2] >> 10 & 1; + 
X86Feature[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; } } - return pg_comp_crc32c(crc, data, len); + X86Feature[init] = true; + +#ifdef X86_FEATURE_DEBUG + /* TODO: DEBUG log all set booleans with enum string */ + fprintf(stderr, "SSE4.2: %s\n", X86Feature[PG_SSE4_2] ? "yes" : "no"); +#endif } -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; +#endif /* defined(USE_SSE2) || defined(__i386__) */ -- 2.53.0