From 5c07fe6c3ecacf2cbcc2c3e081a1bfb2a2fc259b Mon Sep 17 00:00:00 2001 From: John Naylor Date: Tue, 11 Mar 2025 14:57:01 +0700 Subject: [PATCH v14 7/8] AVX-512 CRC / autoconf Author: Raghuveer Devulapalli Author: Paul Amonson --- config/c-compiler.m4 | 30 +++++++++ configure | 151 ++++++++++++++++++++++++++----------------- configure.ac | 104 +++++++++++++---------------- 3 files changed, 164 insertions(+), 121 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c13..f172f260e4e 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -577,6 +577,36 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_CRC32_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86 CRC instructions added in AVX-512, +# using intrinsics with function __attribute__((target("..."))): + +AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128 with function attribute], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + }], + [return crc32_avx512_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_CRC32_INTRINSICS # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index 91c0ffc8272..a7c3d56f9f2 100755 --- a/configure +++ b/configure @@ -17381,7 +17381,7 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 $as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32... " >&6; } @@ -17425,6 +17425,52 @@ if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then fi +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128 with function attribute" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128 with function attribute... " >&6; } +if ${pgac_cv_avx512_crc32_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + #include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx512vl,vpclmulqdq"))) + #endif + static int crc32_avx512_test(void) + { + __m512i x0 = _mm512_set1_epi32(0x1); + __m512i x1 = _mm512_set1_epi32(0x2); + __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq + __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl + int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction + return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); + } +int +main () +{ +return crc32_avx512_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_crc32_intrinsics=yes +else + pgac_cv_avx512_crc32_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_crc32_intrinsics" >&6; } +if test x"$pgac_cv_avx512_crc32_intrinsics" = x"yes"; then + pgac_avx512_crc32_intrinsics=yes +fi + + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. cat confdefs.h - <<_ACEOF >conftest.$ac_ext @@ -17626,9 +17672,8 @@ fi # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -17645,88 +17690,72 @@ fi # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 - else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi - fi - fi - fi - fi -fi -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. { $as_echo "$as_me:${as_lineno-$LINENO}: checking which CRC-32C implementation to use" >&5 $as_echo_n "checking which CRC-32C implementation to use... " >&6; } -if test x"$USE_SSE42_CRC32C" = x"1"; then +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 -$as_echo "SSE 4.2" >&6; } -else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C baseline feature SSE 4.2" >&5 +$as_echo "CRC32C baseline feature SSE 4.2" >&6; } + else + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 -$as_echo "SSE 4.2 with runtime check" >&6; } - else - if test x"$USE_ARMV8_CRC32C" = x"1"; then + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C SSE42 with runtime check" >&5 +$as_echo "CRC32C SSE42 with runtime check" >&6; } + fi + fi + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + +$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRC32C AVX-512 with runtime check" >&5 +$as_echo "CRC32C AVX-512 with runtime check" >&6; } + fi +else + # non x86 code: + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5 $as_echo "ARMv8 CRC instructions" >&6; } - else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then + else + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } - else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + else + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 $as_echo "LoongArch CRCC instructions" >&6; } - else + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } - fi fi fi fi diff --git a/configure.ac b/configure.ac index a85bdbd4ff6..ee8b225ed87 100644 --- a/configure.ac +++ b/configure.ac @@ -2057,10 +2057,14 @@ if test x"$host_cpu" = x"x86_64"; then fi fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 and AVX-512 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() +# Check if the _mm512_clmulepi64_epi128 and _mm_xor_epi64 can be used with with +# the __attribute__((target("avx512vl,vpclmulqdq"))). +PGAC_AVX512_CRC32_INTRINSICS([]) + # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all # define __SSE4_2__ in that case. AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ @@ -2096,9 +2100,8 @@ AC_SUBST(CFLAGS_CRC) # If we are targeting a processor that has Intel SSE 4.2 instructions, we can # use the special CRC instructions for calculating CRC-32C. If we're not # targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# the SSE/AVX-512 intrinsics compile both implementations and select which one +# to use at runtime, depending runtime cpuid information. # # Similarly, if we are targeting an ARM processor that has the CRC # instructions that are part of the ARMv8 CRC Extension, use them. And if @@ -2115,69 +2118,50 @@ AC_SUBST(CFLAGS_CRC) # # If we are targeting a LoongArch processor, CRC instructions are # always available (at least on 64 bit), so no runtime check is needed. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then - # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then - USE_SSE42_CRC32C=1 - else - # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for - # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then - USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 + +AC_MSG_CHECKING([which CRC-32C implementation to use]) +if test x"$host_cpu" = x"x86_64"; then + #x86 only: + PG_CRC32C_OBJS="pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + AC_MSG_RESULT(CRC32C baseline feature SSE 4.2) else - # Use ARM CRC Extension if available. - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then - USE_ARMV8_CRC32C=1 - else - # ARM CRC Extension, with runtime check? - if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then - USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 - else - # LoongArch CRCC instructions. - if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then - USE_LOONGARCH_CRC32C=1 - else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) + PG_CRC32C_OBJS+=" pg_crc32c_sse42.o" + AC_MSG_RESULT(CRC32C SSE42 with runtime check) fi - fi fi - fi -fi - -# Set PG_CRC32C_OBJS appropriately depending on the selected implementation. -AC_MSG_CHECKING([which CRC-32C implementation to use]) -if test x"$USE_SSE42_CRC32C" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2) + if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.]) + AC_MSG_RESULT(CRC32C AVX-512 with runtime check) + fi else - if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" - AC_MSG_RESULT(SSE 4.2 with runtime check) + # non x86 code: + # Use ARM CRC Extension if available. + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then + AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o" + AC_MSG_RESULT(ARMv8 CRC instructions) else - if test x"$USE_ARMV8_CRC32C" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o" - AC_MSG_RESULT(ARMv8 CRC instructions) + # ARM CRC Extension, with runtime check? + if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then - AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" - AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) else - if test x"$USE_LOONGARCH_CRC32C" = x"1"; then - AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_loongarch.o" - AC_MSG_RESULT(LoongArch CRCC instructions) - else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) - fi + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) fi fi fi -- 2.48.1