From f4227534f6090c4c1ddef6f44975fe506c2eb0b3 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Tue, 31 Mar 2026 17:40:38 +0700 Subject: [PATCH v3] Compute CRC32C on ARM using the Crypto Extension where available --- config/c-compiler.m4 | 38 ++++++++- configure | 57 +++++++++++++- configure.ac | 11 ++- meson.build | 33 ++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_crc32c.h | 22 ++++-- src/port/meson.build | 1 + src/port/pg_crc32c_armv8.c | 124 ++++++++++++++++++++++++++++++ src/port/pg_crc32c_armv8_choose.c | 36 ++++++++- 9 files changed, 315 insertions(+), 10 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 629572ee350..0027ef3710c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -759,6 +759,41 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS +# PGAC_ARM_PLMULL_INTRINSICS +# --------------------------- +# Check if the compiler supports Arm CRYPTO carryless multiplication +# instructions used for vectorized CRC. +# +# If the intrinsics are supported, sets pgac_arm_pmull_intrinsics. +############ WIP: is it really safe to overwrite CFLAGS_CRC? 
+AC_DEFUN([PGAC_ARM_PLMULL_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for pmull and pmull2 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>
+#include <arm_acle.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+uint64x2_t r;
+uint64x2_t r2;],
+
+  [__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+   __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+   r = veorq_u64(r, r2);
+   /* return computed value, to prevent the above being optimized away */
+   return __crc32cd(0, vgetq_lane_u64(r, 0));])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_arm_pmull_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_PLMULL_INTRINSICS
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
@@ -784,7 +819,8 @@ AC_CACHE_CHECK(
     /* return computed value, to prevent the above being optimized away */
     return crc == 0;])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])])
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
 if test x"$Ac_cachevar" = x"yes"; then
   pgac_loongarch_crc32c_intrinsics=yes
 fi
diff --git a/configure b/configure
index 0d123d7dc8a..fdecd5b524a 100755
--- a/configure
+++ b/configure
@@ -18395,6 +18395,53 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
   pgac_avx512_pclmul_intrinsics=yes
 fi
 
+else
+  if test x"$host_cpu" = x"aarch64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2 with CFLAGS=-march=armv8-a+crc+simd+crypto" >&5
+$as_echo_n "checking for pmull and pmull2 with CFLAGS=-march=armv8-a+crc+simd+crypto... " >&6; }
+if ${pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -march=armv8-a+crc+simd+crypto"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+#include <arm_acle.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+uint64x2_t r;
+uint64x2_t r2;
+int
+main ()
+{
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+  __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+  r = veorq_u64(r, r2);
+  /* return computed value, to prevent the above being optimized away */
+  return __crc32cd(0, vgetq_lane_u64(r, 0));
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto=yes
+else
+  pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" >&5
+$as_echo "$pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" >&6; }
+if test x"$pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" = x"yes"; then
+  CFLAGS_CRC="-march=armv8-a+crc+simd+crypto"
+  pgac_arm_pmull_intrinsics=yes
+fi
+
+  fi
 fi
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@@ -18406,8 +18453,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
 $as_echo "AVX-512 with runtime check" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+  if test x"$pgac_arm_pmull_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: PMULL CRC with runtime check" >&5
+$as_echo "PMULL CRC with runtime check" >&6; } + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 $as_echo "none" >&6; } + fi fi # Select semaphore implementation type. diff --git a/configure.ac b/configure.ac index 2342780359a..0ea56dc53dd 100644 --- a/configure.ac +++ b/configure.ac @@ -2291,6 +2291,10 @@ AC_SUBST(PG_CRC32C_OBJS) # if test x"$host_cpu" = x"x86_64"; then PGAC_AVX512_PCLMUL_INTRINSICS() +else + if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_PLMULL_INTRINSICS([-march=armv8-a+crc+simd+crypto]) + fi fi AC_MSG_CHECKING([for vectorized CRC-32C]) @@ -2298,7 +2302,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.]) AC_MSG_RESULT(AVX-512 with runtime check) else - AC_MSG_RESULT(none) + if test x"$pgac_arm_pmull_intrinsics" = x"yes"; then + AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.]) + AC_MSG_RESULT(PMULL CRC with runtime check) + else + AC_MSG_RESULT(none) + fi fi # Select semaphore implementation type. diff --git a/meson.build b/meson.build index 0ee772cd475..a0fac181595 100644 --- a/meson.build +++ b/meson.build @@ -2681,6 +2681,39 @@ int main(void) have_optimized_crc = true endif + # Check if the compiler supports ARMv8 CRYPTO carryless multiplication + # and exclusive-or inline assembly instructions used for computing CRC. + # Check __crc32cd here as well, since the full implementation relies on + # 8-byte CRC instructions. 
+  prog = '''
+#include <arm_neon.h>
+#include <arm_acle.h>
+uint64x2_t a;
+uint64x2_t b;
+uint64x2_t c;
+
+int main(void)
+{
+  uint64x2_t r;
+  uint64x2_t r2;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+  r = veorq_u64(r, r2);
+  /* return computed value, to prevent the above being optimized away */
+  return __crc32cd(0, vgetq_lane_u64(r, 0));
+}
+'''
+
+  if cc.links(prog,
+      name: 'PMULL CRC32C',
+      args: test_c_args + ['-march=armv8-a+crc+simd+crypto'])
+    # Use ARM CRYPTO Extension, with runtime check
+    cflags_crc += '-march=armv8-a+crc+simd+crypto'
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8d61918aff..dbc97c565a3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -729,6 +729,9 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
+#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 1f8e837d119..1230709197a 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so for small constant inputs,
+ * we can avoid an indirect function call.
+ */
 #define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = __builtin_constant_p(len) && len < 32 ? 
\ + pg_comp_crc32c_armv8((crc), (data), (len)) : \ + pg_comp_crc32c((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ @@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use ARMv8 instructions, but perform a runtime check first - * to check that they are available. + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions, + * but perform a runtime check first to check that they are available. */ #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c((crc), (data), (len))) @@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index d55cb0424f3..922b3f64676 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -93,6 +93,7 @@ replace_funcs_pos = [ # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], diff 
--git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 9ca0f728d39..64b82f6de58 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -20,6 +20,10 @@
 #include <arm_acle.h>
 #endif
 
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
 #include "port/pg_crc32c.h"
 
 pg_crc32c
@@ -77,3 +81,123 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ * - match our function declaration
+ * - match whitespace to our project style
+ * - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* align to 16 bytes */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. 
*/ + uint64x2_t x0 = vld1q_u64((const uint64_t *) buf), + y0; + uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)), + y1; + uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)), + y2; + uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)), + y3; + uint64x2_t k; + + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8}; + + k = vld1q_u64(k_); + } + + /* + * pgindent complained of unmatched parens upstream: + * + * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0); + */ + x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0); + buf += 64; + + /* Main loop. */ + while (buf <= limit) + { + y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0); + y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1); + y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2); + y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); + y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. 
*/ + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_armv8(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index a1f0e540c6b..164af65454b 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void) #endif } +static inline bool +pg_pmull_available(void) +{ +#if defined(__aarch64__) && defined(HWCAP_PMULL) + +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_PMULL) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +#else + return false; +#endif + +#else + return false; +#endif +} + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. @@ -115,10 +136,21 @@ pg_crc32c_armv8_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. + */ +#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + if (pg_crc32c_armv8_available()) pg_comp_crc32c = pg_comp_crc32c_armv8; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + if (pg_pmull_available()) + pg_comp_crc32c = pg_comp_crc32c_pmull; +#endif return pg_comp_crc32c(crc, data, len); } -- 2.53.0