diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5db02b2ab7..a5a3246199 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,36 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using +# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ. +# +# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq). +# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and +# CFLAGS_AVX512_POPCNT are set. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include +#include ], + [__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_AVX512_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS diff --git a/configure b/configure index 2a1ee251f2..7fe669cda2 100755 --- a/configure +++ b/configure @@ -647,6 +647,7 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +CFLAGS_AVX512_POPCNT LIBOBJS OPENSSL ZSTD @@ -15209,7 +15210,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15255,7 +15256,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15279,7 +15280,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15324,7 +15325,7 @@ else We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -15348,7 +15349,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext We can't simply define LARGE_OFF_T to be 9223372036854775807, since some C++ compilers masquerading as C compilers incorrectly reject 9223372036854775807. */ -#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62)) +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 && LARGE_OFF_T % 2147483647 == 1) ? 1 : -1]; @@ -17702,6 +17703,41 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h fi +# Check for x86 cpuid_count instruction +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__get_cpuid_count="yes" +else + pgac_cv__get_cpuid_count="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 $as_echo_n "checking for __cpuid... " >&6; } if ${pgac_cv__cpuid+:} false; then : @@ -17736,6 +17772,164 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +$as_echo_n "checking for __cpuidex... " >&6; } +if ${pgac_cv__cpuidex+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__cpuidex="yes" +else + pgac_cv__cpuidex="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 +$as_echo "$pgac_cv__cpuidex" >&6; } +if test x"$pgac_cv__cpuidex" = x"yes"; then + +$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __immintrin" >&5 +$as_echo_n "checking for __immintrin... " >&6; } +if ${pgac_cv__immintrin+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +/* Don't exclude code so added return. */ + return 1701; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__immintrin="yes" +else + pgac_cv__immintrin="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__immintrin" >&5 +$as_echo "$pgac_cv__immintrin" >&6; } +if test x"$pgac_cv__immintrin" = x"yes"; then + +$as_echo "#define HAVE__IMMINTRIN 1" >>confdefs.h + +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ +__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_AVX512_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512f" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ +__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" = x"yes"; then + CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq -mavx512f" + pgac_avx512_popcnt_intrinsics=yes +fi + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 52fd7af446..ae110e9d59 100644 --- a/configure.ac +++ b/configure.ac @@ -2067,6 +2067,18 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) fi +# Check for x86 cpuid_count instruction +AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + ]])], + [pgac_cv__get_cpuid_count="yes"], + [pgac_cv__get_cpuid_count="no"])]) +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid.]) +fi + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2078,6 +2090,36 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +fi + +AC_CACHE_CHECK([for __immintrin], [pgac_cv__immintrin], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[/* Don't exclude code so added return. */ + return 1701; + ]])], + [pgac_cv__immintrin="yes"], + [pgac_cv__immintrin="no"])]) +if test x"$pgac_cv__immintrin" = x"yes"; then + AC_DEFINE(HAVE__IMMINTRIN, 1, [Define to 1 if you have immintrin.]) +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +PGAC_AVX512_POPCNT_INTRINSICS([]) +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512f]) +fi +AC_SUBST(CFLAGS_AVX512_POPCNT) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/meson.build b/meson.build index 8ed51b6aae..1e7a4dc942 100644 --- a/meson.build +++ b/meson.build @@ -1773,6 +1773,45 @@ elif cc.links(''' endif +# XXX: The configure.ac check for __cpuidex() is broken, we don't copy that +# here. To prevent problems due to two detection methods working, stop +# checking after one. +if cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + +# Check for header immintrin.h +if cc.links(''' + #include + int main(int arg, char **argv) + { + return 1701; + } + ''', name: '__immintrin', + args: test_c_args) + cdata.set('HAVE__IMMINTRIN', 1) +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -2146,6 +2185,32 @@ elif host_cpu == 'ppc' or host_cpu == 'ppc64' endif endif +############################################################### +# AVX 512 POPCNT Intrinsic check +############################################################### +have_avx512_popcnt = false +cflags_avx512_popcnt = [] +if host_cpu == 'x86_64' + prog = ''' + #include + #include + void main(void) + { + __m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + }''' + if cc.links(prog, name: '_mm512_setzero_si512, _mm512_popcnt_epi64, _mm512_store_si512, and _mm512_reduce_add_epi64 with -mavx512vpopcntdq -mavx512f', + args: test_c_args + ['-mavx512vpopcntdq', '-mavx512f']) + have_avx512_popcnt = true + cflags_avx512_popcnt = ['-mavx512vpopcntdq', '-mavx512f'] + endif +endif ############################################################### diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..089f49b7f3 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07e73567dc..20e14c6499 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -555,6 +555,12 @@ /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + +/* Define to 1 if you have immintrin. */ +#undef HAVE__IMMINTRIN + /* Define to 1 if your compiler understands _Static_assert. */ #undef HAVE__STATIC_ASSERT diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d..ee3647282e 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -100,6 +100,7 @@ pgxs_kv = { ' '.join(cflags_no_decl_after_statement), 'CFLAGS_CRC': ' '.join(cflags_crc), + 'CFLAGS_AVX512_POPCNT': ' '.join(cflags_avx512_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..6a01a7d89a 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -87,6 +87,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) +# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024) +pg_bitutils.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_bitutils_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_bitutils_srv.o:CFLAGS+=$(CFLAGS_AVX512_POPCNT) + # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) diff --git a/src/port/meson.build b/src/port/meson.build index 69b30ab21b..1c48a3b07e 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -184,6 +184,7 @@ foreach name, opts : pgport_variants link_with: cflag_libs, c_pch: pch_c_h, kwargs: opts + { + 'c_args': opts.get('c_args', []) + cflags_avx512_popcnt, 'dependencies': opts['dependencies'] + [ssl], } ) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 640a89561a..cda41e7438 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -15,10 +15,15 @@ #ifdef HAVE__GET_CPUID #include #endif + #ifdef HAVE__CPUID #include #endif +#if defined(HAVE__IMMINTRIN) +#include +#endif + #include "port/pg_bitutils.h" @@ -110,11 +115,15 @@ static int pg_popcount64_slow(uint64 word); static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); +static uint64 pg_popcount512_choose(const char *buf, int bytes); static int pg_popcount32_fast(uint32 word); static int pg_popcount64_fast(uint64 word); +static uint64 pg_popcount512_fast(const char *buf, int bytes); +static uint64 pg_popcount512_slow(const char *buf, int bytes); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; +uint64 (*pg_popcount512) (const char *buf, int bytes) = pg_popcount512_choose; #endif /* TRY_POPCNT_FAST */ #ifdef TRY_POPCNT_FAST @@ -138,6 +147,50 @@ pg_popcount_available(void) return (exx[2] & (1 << 23)) != 0; /* POPCNT */ } +/* + * Return true if CPUID indicates that the AVX512_POPCNT instruction is + * available. This is similar to the method above; see + * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + * + * Finally, we make sure the xgetbv result is consistent with the CPUID + * results. + */ +static bool +pg_popcount512_available(void) +{ +#ifdef HAVE__IMMINTRIN + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Check for AVX512VPOPCNTDQ and AVX512F */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error __get_cpuid_count or __cpuidex instruction not available +#endif + + if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0) + { + /* + * CPUID succeeded, does the current running OS support the + * ZMM registers which are required for AVX512? This check is + * required to make sure an old OS on a new CPU is correctly + * checked or a VM hypervisor is not excluding AVX512 ZMM + * support in the VM; see "5.1.9 Detection of AVX Instructions" + * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html + */ + uint64 xcr = 0; + uint32 high; + uint32 low; + +__asm__ __volatile__("xgetbv\t\n":"=a"(low), "=d"(high):"c"(xcr)); + return (low & 0xE0) != 0; + } /* POPCNT 512 */ +#endif /* HAVE__IMMINTRIN */ + return false; +} + /* * These functions get called on the first call to pg_popcount32 etc. * They detect whether we can use the asm implementations, and replace @@ -178,6 +231,17 @@ pg_popcount64_choose(uint64 word) return pg_popcount64(word); } +static uint64 +pg_popcount512_choose(const char *buf, int bytes) +{ + if (pg_popcount512_available()) + pg_popcount512 = pg_popcount512_fast; + else + pg_popcount512 = pg_popcount512_slow; + + return pg_popcount512(buf, bytes); +} + /* * pg_popcount32_fast * Return the number of 1 bits set in word @@ -212,6 +276,33 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); #endif } +static uint64 +pg_popcount512_fast(const char *buf, int bytes) +{ + uint64 popcnt = 0; +#ifdef HAVE__IMMINTRIN + __m512i accumulator = _mm512_setzero_si512(); + + while (bytes >= 64) + { + const __m512i v = _mm512_loadu_si512((const __m512i *) buf); + const __m512i p = _mm512_popcnt_epi64(v); + + accumulator = _mm512_add_epi64(accumulator, p); + bytes -= 64; + buf += 64; + } + + popcnt = _mm512_reduce_add_epi64(accumulator); + bytes = bytes % 64; +#endif /* HAVE__IMMINTRIN */ + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + return popcnt; +} + #endif /* TRY_POPCNT_FAST */ @@ -265,6 +356,31 @@ pg_popcount64_slow(uint64 word) #endif /* HAVE__BUILTIN_POPCOUNT */ } +static uint64 +pg_popcount512_slow(const char *buf, int bytes) +{ + uint64 popcnt = 0; + + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64(*words++); + bytes -= 8; + } + + buf = (const char *) words; + } + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + #ifndef TRY_POPCNT_FAST /* @@ -286,6 +402,12 @@ pg_popcount64(uint64 word) return pg_popcount64_slow(word); } +uint64 +pg_popcount512(const char *buf, int bytes) +{ + return pg_popcount512_slow(buf, bytes); +} + #endif /* !TRY_POPCNT_FAST */ /* @@ -298,19 +420,7 @@ pg_popcount(const char *buf, int bytes) uint64 popcnt = 0; #if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) - { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64(*words++); - bytes -= 8; - } - - buf = (const char *) words; - } + return pg_popcount512(buf, bytes); #else /* Process in 32-bit chunks if the buffer is aligned. */ if (buf == (const char *) TYPEALIGN(4, buf))