From 67a887d8fe8be389c709fdc87b196cd2ad5d2bf7 Mon Sep 17 00:00:00 2001 From: Paul Amonson Date: Wed, 13 Mar 2024 12:49:57 -0700 Subject: [PATCH 2/2] [Feat] Add AVX-512 POPCNT support initial checkin. Signed-off-by: Paul Amonson --- config/c-compiler.m4 | 37 ++++++ configure | 205 ++++++++++++++++++++++++++++++ configure.ac | 44 +++++++ meson.build | 72 +++++++++++ src/Makefile.global.in | 1 + src/include/pg_config.h.in | 12 ++ src/include/port/pg_bitutils.h | 10 +- src/makefiles/meson.build | 1 + src/port/Makefile | 5 + src/port/meson.build | 6 +- src/port/pg_bitutils.c | 9 +- src/port/pg_popcnt_choose.c | 67 +++++++++- src/port/pg_popcnt_x86_64_accel.c | 36 +++++- 13 files changed, 494 insertions(+), 11 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 3268a780bb..54f7415e5a 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,40 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using +# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ. +# +# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq). +# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and +# CFLAGS_AVX512_POPCNT are set. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include +#include +#include +#include ], + [const uint64_t *buf = malloc((size_t)64); + uint64_t popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + memset(buf, 0, 64); + accumulator = _mm512_add_epi64(accumulator, p); + popcnt = _mm512_reduce_add_epi64(accumulator); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_AVX512_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS \ No newline at end of file diff --git a/configure b/configure index 36feeafbb2..0fbfc7c78f 100755 --- a/configure +++ b/configure @@ -647,6 +647,7 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +CFLAGS_AVX512_POPCNT LIBOBJS OPENSSL ZSTD @@ -17404,6 +17405,41 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h fi +# Check for x86 cpuid_count instruction +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__get_cpuid_count="yes" +else + pgac_cv__get_cpuid_count="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 $as_echo_n "checking for __cpuid... " >&6; } if ${pgac_cv__cpuid+:} false; then : @@ -17438,6 +17474,175 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +$as_echo_n "checking for __cpuidex... " >&6; } +if ${pgac_cv__cpuidex+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__cpuidex="yes" +else + pgac_cv__cpuidex="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 +$as_echo "$pgac_cv__cpuidex" >&6; } +if test x"$pgac_cv__cpuidex" = x"yes"; then + +$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __immintrin" >&5 +$as_echo_n "checking for __immintrin... " >&6; } +if ${pgac_cv__immintrin+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +/* Don't exclude code so added return. */ + return 1701; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__immintrin="yes" +else + pgac_cv__immintrin="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__immintrin" >&5 +$as_echo "$pgac_cv__immintrin" >&6; } +if test x"$pgac_cv__immintrin" = x"yes"; then + +$as_echo "#define HAVE__IMMINTRIN 1" >>confdefs.h + +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +int +main () +{ +const uint64_t *buf = malloc((size_t)64); + uint64_t popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + memset(buf, 0, 64); + accumulator = _mm512_add_epi64(accumulator, p); + popcnt = _mm512_reduce_add_epi64(accumulator); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_AVX512_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +int +main () +{ +const uint64_t *buf = malloc((size_t)64); + uint64_t popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + memset(buf, 0, 64); + accumulator = _mm512_add_epi64(accumulator, p); + popcnt = _mm512_reduce_add_epi64(accumulator); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then + CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq" + pgac_avx512_popcnt_intrinsics=yes +fi + + +$as_echo "#define HAVE__AVX512_POPCNT 1" >>confdefs.h + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 57f734879e..3c741d457d 100644 --- a/configure.ac +++ b/configure.ac @@ -2052,6 +2052,18 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) fi +# Check for x86 cpuid_count instruction +AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + ]])], + [pgac_cv__get_cpuid_count="yes"], + [pgac_cv__get_cpuid_count="no"])]) +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid.]) +fi + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2063,6 +2075,38 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +fi + +AC_CACHE_CHECK([for __immintrin], [pgac_cv__immintrin], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[/* Don't exclude code so added return. */ + return 1701; + ]])], + [pgac_cv__immintrin="yes"], + [pgac_cv__immintrin="no"])]) +if test x"$pgac_cv__immintrin" = x"yes"; then + AC_DEFINE(HAVE__IMMINTRIN, 1, [Define to 1 if you have immintrin.]) +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +PGAC_AVX512_POPCNT_INTRINSICS([]) +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq]) + AC_DEFINE(HAVE__AVX512_POPCNT, 1, [Define to 1 if you have cpu + support for AVX512 POPCNT.]) +fi +AC_SUBST(CFLAGS_AVX512_POPCNT) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/meson.build b/meson.build index 85788f9dd8..39480b4251 100644 --- a/meson.build +++ b/meson.build @@ -1773,6 +1773,37 @@ elif cc.links(''' endif +if cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + +# Check for header immintrin.h +if cc.has_header('immintrin.h', + include_directories: postgres_inc, args: test_c_args) + cdata.set('HAVE__IMMINTRIN', 1, + description: 'Define to 1 if you have the immintrin.h header file.') +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -2147,6 +2178,47 @@ elif host_cpu == 'ppc' or host_cpu == 'ppc64' endif +############################################################### +# AVX 512 POPCNT Intrinsic check +############################################################### +have_avx512_popcnt = false +cflags_avx512_popcnt = [] +if host_cpu == 'x86_64' + test_flags = ['-mavx512vpopcntdq'] + if host_system == 'windows' + test_flags = ['/arch:AVX512'] + endif + prog = ''' + #include + #include + #include + #include + void main(void) + { + const uint64_t *buf = malloc((size_t)64); + uint64_t popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + memset(buf, 0, 64); + accumulator = _mm512_add_epi64(accumulator, p); + popcnt = _mm512_reduce_add_epi64(accumulator); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + }''' + if cc.links(prog, name: '_mm512_* methods with -mavx512vpopcntdq flag.', + args: test_c_args + test_flags) + have_avx512_popcnt = true + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cdata.set('HAVE__AVX512_POPCNT', 1) + cflags_avx512_popcnt = test_flags + else + have_avx512_popcnt = false + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cflags_avx512_popcnt = [] + endif # compile/link test +endif # host_cpu check + ############################################################### # Library / OS tests diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..089f49b7f3 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 591e1ca3df..e4d56dee79 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -555,6 +555,18 @@ /* Define to 1 if you have __cpuid. */ #undef HAVE__CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + +/* Define to 1 if you have __get_cpuidex. */ +#undef HAVE__GET_CPUIDEX + +/* Define to 1 if you have immintrin. */ +#undef HAVE__IMMINTRIN + +/* Define to 1 if you have AVX512. */ +#undef HAVE__AVX512_POPCNT + /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 46bf4f0103..cc42ce49c9 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -300,19 +300,19 @@ pg_ceil_log2_64(uint64 num) #ifdef TRY_POPCNT_FAST /* Attempt to use the POPCNT instruction, but perform a runtime check first */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); - +extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); +extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); +extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes); #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); -#endif /* TRY_POPCNT_FAST */ - /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); +#endif /* TRY_POPCNT_FAST */ + /* * Rotate the bits of "word" to the right/left by n bits. */ diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d..ee3647282e 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -100,6 +100,7 @@ pgxs_kv = { ' '.join(cflags_no_decl_after_statement), 'CFLAGS_CRC': ' '.join(cflags_crc), + 'CFLAGS_AVX512_POPCNT': ' '.join(cflags_avx512_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), diff --git a/src/port/Makefile b/src/port/Makefile index 12c56b0ba7..0b76926301 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -89,6 +89,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) +# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024) +pg_popcnt_x86_64_accel.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcnt_x86_64_accel_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcnt_x86_64_accel_srv.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) + # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) diff --git a/src/port/meson.build b/src/port/meson.build index ed8828c739..d7930672cb 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,7 +8,6 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcnt_choose.c', - 'pg_popcnt_x86_64_accel.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', @@ -86,6 +85,7 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcnt_x86_64_accel', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'avx512'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -100,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'avx512': cflags_avx512_popcnt} +pgport_sources_cflags = {'crc': [], 'avx512': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index d8b045d0a4..22c51c1679 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -98,6 +98,7 @@ const uint8 pg_number_of_ones[256] = { int pg_popcount32_slow(uint32 word); int pg_popcount64_slow(uint64 word); +uint64 pg_popcount_slow(const char *buf, int bytes); /* * pg_popcount32_slow @@ -170,6 +171,12 @@ pg_popcount64(uint64 word) return pg_popcount64_slow(word); } +uint64 +pg_popcount(const char *buf, int bytes) +{ + return pg_popcount_slow(buf, bytes); +} + #endif /* !TRY_POPCNT_FAST */ /* @@ -177,7 +184,7 @@ pg_popcount64(uint64 word) * Returns the number of 1-bits in buf */ uint64 -pg_popcount(const char *buf, int bytes) +pg_popcount_slow(const char *buf, int bytes) { uint64 popcnt = 0; diff --git a/src/port/pg_popcnt_choose.c b/src/port/pg_popcnt_choose.c index 89fcf2609c..ac1344415d 100644 --- a/src/port/pg_popcnt_choose.c +++ b/src/port/pg_popcnt_choose.c @@ -26,18 +26,23 @@ /* In pg_bitutils.c file */ int pg_popcount32_slow(uint32 word); int pg_popcount64_slow(uint64 word); +uint64 pg_popcount_slow(const char *buf, int bytes); #ifdef TRY_POPCNT_FAST static bool pg_popcount_available(void); +static bool pg_popcount512_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); +static uint64 pg_popcount_choose(const char *buf, int bytes); /* In pg_popcnt_*_accel source file. */ int pg_popcount32_fast(uint32 word); int pg_popcount64_fast(uint64 word); +uint64 pg_popcount_fast(const char *buf, int bytes); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; +uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose; #endif /* TRY_POPCNT_FAST */ #ifdef TRY_POPCNT_FAST @@ -61,6 +66,52 @@ pg_popcount_available(void) return (exx[2] & (1 << 23)) != 0; /* POPCNT */ } +/* + * Return true if CPUID indicates that the AVX512_POPCNT instruction is + * available. This is similar to the method above; see + * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + * + * Finally, we make sure the xgetbv result is consistent with the CPUID + * results. + */ +static bool +pg_popcount512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Check for AVX512VPOPCNTDQ and AVX512F */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + + if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0) + { + /* + * CPUID succeeded, does the current running OS support the + * ZMM registers which are required for AVX512? This check is + * required to make sure an old OS on a new CPU is correctly + * checked or a VM hypervisor is not excluding AVX512 ZMM + * support in the VM; see "5.1.9 Detection of AVX Instructions" + * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html + */ + uint64 xcr = 0; +#ifdef _MSC_VER + uint64 highlow = _xgetbv(xcr); + + return (highlow & 0xE0) != 0; +#else + uint32 high; + uint32 low; + + __asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr)); + return (low & 0xE0) != 0; +#endif + } /* POPCNT 512 */ + return false; +} + /* * These functions get called on the first call to pg_popcount32 etc. * They detect whether we can use the asm implementations, and replace @@ -69,15 +120,23 @@ pg_popcount_available(void) */ static void setup_function_pointers() { - if (pg_popcount_available()) + if (pg_popcount512_available()) { pg_popcount32 = pg_popcount32_fast; pg_popcount64 = pg_popcount64_fast; + pg_popcount = pg_popcount_fast; + } + else if (pg_popcount_available()) + { + pg_popcount32 = pg_popcount32_fast; + pg_popcount64 = pg_popcount64_fast; + pg_popcount = pg_popcount_slow; } else { pg_popcount32 = pg_popcount32_slow; pg_popcount64 = pg_popcount64_slow; + pg_popcount = pg_popcount_slow; } } @@ -94,4 +153,10 @@ pg_popcount64_choose(uint64 word) setup_function_pointers(); return pg_popcount64(word); } + +static uint64 +pg_popcount_choose(const char* buf, int bytes) { + setup_function_pointers(); + return pg_popcount(buf, bytes); +} #endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcnt_x86_64_accel.c b/src/port/pg_popcnt_x86_64_accel.c index 2e9b2ee774..ecc07afd37 100644 --- a/src/port/pg_popcnt_x86_64_accel.c +++ b/src/port/pg_popcnt_x86_64_accel.c @@ -14,8 +14,14 @@ #include "port/pg_bitutils.h" +#if defined(HAVE__IMMINTRIN) +#include +#endif + int pg_popcount32_fast(uint32 word); int pg_popcount64_fast(uint64 word); +uint64 pg_popcount_fast(const char *buf, int bytes); +uint64 pg_popcount_slow(const char *buf, int bytes); #ifdef TRY_POPCNT_FAST /* @@ -52,4 +58,32 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); #endif } -#endif /* TRY_POPCNT_FAST */ +/* + * Use AVX-512 Intrinsics for supported Intel CPUs or fall back the the software + * loop in pg_bunutils.c and use the best 32 or 64 bit fast methods. If no fast + * methods are used this will fall back to __builtin_* or pure software. + */ +uint64 +pg_popcount_fast(const char *buf, int bytes) +{ + uint64 popcnt = 0; + #if defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 + __m512i accumulator = _mm512_setzero_si512(); + + while (bytes >= 64) + { + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + + accumulator = _mm512_add_epi64(accumulator, p); + bytes -= 64; + buf += 64; + } + + popcnt = _mm512_reduce_add_epi64(accumulator); +#endif /* defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 */ + + /* Process any remaining bytes */ + return popcnt + pg_popcount_slow(buf, bytes); +} +#endif /* TRY_POPCNT_FAST */ -- 2.34.1