diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5db02b2ab7..a5a3246199 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,36 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# --------------------------- +# Check if the compiler supports the x86_64 AVX512 POPCNT instructions using +# intrinsics used in CPUID features AVX512F and AVX512VPOPCNTDQ. +# +# Optional compiler flags can be passed as argument (e.g. -mavx512vpopcntdq). +# If the intrinsics are supported then pgac_avx512_popcnt_intrinsics and +# CFLAGS_AVX512_POPCNT are set. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include +#include ], + [__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_AVX512_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS diff --git a/configure b/configure index 6b87e5c9a8..0252dab6d5 100755 --- a/configure +++ b/configure @@ -647,6 +647,7 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +CFLAGS_AVX512_POPCNT LIBOBJS OPENSSL ZSTD @@ -17708,6 +17709,41 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h fi +# Check for x86 cpuid_count instruction +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__get_cpuid_count="yes" +else + pgac_cv__get_cpuid_count="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 $as_echo_n "checking for __cpuid... " >&6; } if ${pgac_cv__cpuid+:} false; then : @@ -17742,6 +17778,164 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +$as_echo_n "checking for __cpuidex... " >&6; } +if ${pgac_cv__cpuidex+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__cpuidex="yes" +else + pgac_cv__cpuidex="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 +$as_echo "$pgac_cv__cpuidex" >&6; } +if test x"$pgac_cv__cpuidex" = x"yes"; then + +$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __immintrin" >&5 +$as_echo_n "checking for __immintrin... " >&6; } +if ${pgac_cv__immintrin+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +/* Don't exclude code so added return. */ + return 1701; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__immintrin="yes" +else + pgac_cv__immintrin="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__immintrin" >&5 +$as_echo "$pgac_cv__immintrin" >&6; } +if test x"$pgac_cv__immintrin" = x"yes"; then + +$as_echo "#define HAVE__IMMINTRIN 1" >>confdefs.h + +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ +__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_AVX512_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512f... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512f" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +int +main () +{ +__m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512f" = x"yes"; then + CFLAGS_AVX512_POPCNT="-mavx512vpopcntdq -mavx512f" + pgac_avx512_popcnt_intrinsics=yes +fi + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 6e64ece11d..8fcf635b08 100644 --- a/configure.ac +++ b/configure.ac @@ -2068,6 +2068,18 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) fi +# Check for x86 cpuid_count instruction +AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + ]])], + [pgac_cv__get_cpuid_count="yes"], + [pgac_cv__get_cpuid_count="no"])]) +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid.]) +fi + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2079,6 +2091,36 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +fi + +AC_CACHE_CHECK([for __immintrin], [pgac_cv__immintrin], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], + [[/* Don't exclude code so added return. */ + return 1701; + ]])], + [pgac_cv__immintrin="yes"], + [pgac_cv__immintrin="no"])]) +if test x"$pgac_cv__immintrin" = x"yes"; then + AC_DEFINE(HAVE__IMMINTRIN, 1, [Define to 1 if you have immintrin.]) +fi + +# Check for Intel AVX512 intrinsics to do POPCNT calculations. +# +PGAC_AVX512_POPCNT_INTRINSICS([]) +if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512f]) +fi +AC_SUBST(CFLAGS_AVX512_POPCNT) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/meson.build b/meson.build index 8ed51b6aae..bd297d9fa9 100644 --- a/meson.build +++ b/meson.build @@ -1773,6 +1773,37 @@ elif cc.links(''' endif +if cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + +# Check for header immintrin.h +if cc.has_header('immintrin.h', + include_directories: postgres_inc, args: test_c_args) + cdata.set('HAVE__IMMINTRIN', 1, + description: 'Define to 1 if you have the immintrin.h header file.') +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -2146,6 +2177,43 @@ elif host_cpu == 'ppc' or host_cpu == 'ppc64' endif endif +############################################################### +# AVX 512 POPCNT Intrinsic check +############################################################### +have_avx512_popcnt = false +cflags_avx512_popcnt = [] +if host_cpu == 'x86_64' + test_flags = ['-mavx512vpopcntdq', '-mavx512f'] + if host_system == 'windows' + test_flags = ['/arch:AVX512'] + endif + prog = ''' + #include + #include + void main(void) + { + __m512i tmp __attribute__((aligned(64))); + __m512i input = _mm512_setzero_si512(); + __m512i output = _mm512_popcnt_epi64(input); + uint64_t cnt = 999; + _mm512_store_si512(&tmp, output); + cnt = _mm512_reduce_add_epi64(tmp); + /* return computed value, to prevent the above being optimized away */ + return cnt == 0; + }''' + if cc.links(prog, name: '_mm512_* methods with -mavx512vpopcntdq -mavx512f', + args: test_c_args + test_flags) + have_avx512_popcnt = true + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cdata.set('HAVE__AVX512_POPCNT', 1) + cflags_avx512_popcnt = test_flags + else + have_avx512_popcnt = false + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cflags_avx512_popcnt = [] + endif # compile/link test +endif # host_cpu check + ############################################################### diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..089f49b7f3 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -263,6 +263,7 @@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_AVX512_POPCNT = @CFLAGS_AVX512_POPCNT@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07e73567dc..20e14c6499 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -555,6 +555,12 @@ /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + +/* Define to 1 if you have immintrin. */ +#undef HAVE__IMMINTRIN + /* Define to 1 if your compiler understands _Static_assert. */ #undef HAVE__STATIC_ASSERT diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 799f70d052..caca78d805 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -303,16 +303,23 @@ pg_ceil_log2_64(uint64 num) extern int (*pg_popcount32) (uint32 word); extern int (*pg_popcount64) (uint64 word); +#if defined(_MSC_VER) +extern uint64 pg_popcount(const char *buf, int bytes); +extern uint64 (*pg_popcount_indirect)(const char *buf, int bytes); +#else +extern uint64 (*pg_popcount)(const char *buf, int bytes); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); -#endif /* TRY_POPCNT_FAST */ - /* Count the number of one-bits in a byte array */ extern uint64 pg_popcount(const char *buf, int bytes); +#endif /* TRY_POPCNT_FAST */ + /* * Rotate the bits of "word" to the right/left by n bits. */ diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d..ee3647282e 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -100,6 +100,7 @@ pgxs_kv = { ' '.join(cflags_no_decl_after_statement), 'CFLAGS_CRC': ' '.join(cflags_crc), + 'CFLAGS_AVX512_POPCNT': ' '.join(cflags_avx512_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..ef6c02a6bf 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -43,6 +43,8 @@ OBJS = \ inet_net_ntop.o \ noblock.o \ path.o \ + pg_popcnt_choose.o \ + pg_popcnt_x86_64_accel.o \ pg_bitutils.o \ pg_strong_random.o \ pgcheckdir.o \ @@ -87,6 +89,11 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC) +# Newer Intel processors can use AVX-512 POPCNT Capabilities (01/30/2024) +pg_popcnt_x86_64_accel.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcnt_x86_64_accel_shlib.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) +pg_popcnt_x86_64_accel_srv.o: CFLAGS+=$(CFLAGS_AVX512_POPCNT) + # all versions of pg_crc32c_armv8.o need CFLAGS_CRC pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef..d7930672cb 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcnt_choose.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', @@ -84,6 +85,7 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcnt_x86_64_accel', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'avx512'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -98,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'avx512': cflags_avx512_popcnt} +pgport_sources_cflags = {'crc': [], 'avx512': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 640a89561a..942e396141 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -12,16 +12,8 @@ */ #include "c.h" -#ifdef HAVE__GET_CPUID -#include -#endif -#ifdef HAVE__CPUID -#include -#endif - #include "port/pg_bitutils.h" - /* * Array giving the position of the left-most set bit for each possible * byte value. We count the right-most position as the 0th bit, and the @@ -78,6 +70,7 @@ const uint8 pg_rightmost_one_pos[256] = { 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 }; + /* * Array giving the number of 1-bits in each possible byte value. * @@ -103,123 +96,35 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -static int pg_popcount32_slow(uint32 word); -static int pg_popcount64_slow(uint64 word); +int pg_popcount32_slow(uint32 word); +int pg_popcount64_slow(uint64 word); +uint64 pg_popcount_slow(const char *buf, int bytes); #ifdef TRY_POPCNT_FAST -static bool pg_popcount_available(void); -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); -static int pg_popcount32_fast(uint32 word); -static int pg_popcount64_fast(uint64 word); +extern int pg_popcount32_choose(uint32 word); +extern int pg_popcount64_choose(uint64 word); +extern uint64 pg_popcount_choose(const char *buf, int bytes); int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; -#endif /* TRY_POPCNT_FAST */ - -#ifdef TRY_POPCNT_FAST - -/* - * Return true if CPUID indicates that the POPCNT instruction is available. - */ -static bool -pg_popcount_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} - -/* - * These functions get called on the first call to pg_popcount32 etc. - * They detect whether we can use the asm implementations, and replace - * the function pointers so that subsequent calls are routed directly to - * the chosen implementation. - */ -static int -pg_popcount32_choose(uint32 word) -{ - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - } - - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - } - - return pg_popcount64(word); -} - -/* - * pg_popcount32_fast - * Return the number of 1 bits set in word - */ -static int -pg_popcount32_fast(uint32 word) +#if defined(_MSC_VER) +uint64 (*pg_popcount_indirect)(const char *buf, int bytes) = pg_popcount_choose; +uint64 pg_popcount(const char *buf, int bytes) { -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif + return pg_popcount_indirect(buf, bytes); } - -/* - * pg_popcount64_fast - * Return the number of 1 bits set in word - */ -static int -pg_popcount64_fast(uint64 word) -{ -#ifdef _MSC_VER - return __popcnt64(word); #else - uint64 res; - -__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; +uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose; #endif -} - -#endif /* TRY_POPCNT_FAST */ - +#else /* TRY_POPCNT_FAST */ +uint64 pg_popcount(const char *buf, int bytes); +#endif /* TRY_POPCNT_FAST */ /* * pg_popcount32_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount32_slow(uint32 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -241,7 +146,7 @@ pg_popcount32_slow(uint32 word) * pg_popcount64_slow * Return the number of 1 bits set in word */ -static int +int pg_popcount64_slow(uint64 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -286,22 +191,29 @@ pg_popcount64(uint64 word) return pg_popcount64_slow(word); } +uint64 +pg_popcount(const char *buf, int bytes) +{ + return pg_popcount_slow(buf, bytes); +} + #endif /* !TRY_POPCNT_FAST */ /* * pg_popcount - * Returns the number of 1-bits in buf + * Returns the number of 1-bits in buf using either 32 or 64 bit loops + * or fallback to __builtin_* or pure software. */ uint64 -pg_popcount(const char *buf, int bytes) +pg_popcount_slow(const char *buf, int bytes) { uint64 popcnt = 0; -#if SIZEOF_VOID_P >= 8 +#if SIZEOF_VOID_P == 8 /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) + if (buf == (const char *)TYPEALIGN(8, buf)) { - const uint64 *words = (const uint64 *) buf; + const uint64 *words = (const uint64 *)buf; while (bytes >= 8) { @@ -309,9 +221,9 @@ pg_popcount(const char *buf, int bytes) bytes -= 8; } - buf = (const char *) words; + buf = (const char *)words; } -#else +#elif SIZEOF_VOID_P == 4 /* Process in 32-bit chunks if the buffer is aligned. */ if (buf == (const char *) TYPEALIGN(4, buf)) { diff --git a/src/port/pg_popcnt_choose.c b/src/port/pg_popcnt_choose.c new file mode 100644 index 0000000000..e170e16ff9 --- /dev/null +++ b/src/port/pg_popcnt_choose.c @@ -0,0 +1,168 @@ +/*------------------------------------------------------------------------- + * + * pg_popcnt_x86_64_choose.c + * Miscellaneous functions for bit-wise operations. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcnt_x86_64_choose.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_bitutils.h" + +#ifdef TRY_POPCNT_FAST + +#ifdef HAVE__GET_CPUID +#include +#endif + +#ifdef HAVE__CPUID +#include +#endif + +static bool pg_popcount_available(void); +int pg_popcount32_choose(uint32 word); +int pg_popcount64_choose(uint64 word); +uint64 pg_popcount_choose(const char *buf, int bytes); + +extern int pg_popcount32_fast(uint32 word); +extern int pg_popcount64_fast(uint64 word); +extern int pg_popcount32_slow(uint32 word); +extern int pg_popcount64_slow(uint64 word); +extern uint64 pg_popcount512_fast(const char *buf, int bytes); +extern uint64 pg_popcount_slow(const char *buf, int bytes); +extern uint64 (*pg_popcount_indirect)(const char *buf, int bytes); + +extern int (*pg_popcount32)(uint32 word); +extern int (*pg_popcount64)(uint64 word); + +/* + * Return true if CPUID indicates that the POPCNT instruction is available. + */ +static bool +pg_popcount_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + + return (exx[2] & (1 << 23)) != 0; /* POPCNT */ +} + +/* + * Return true if CPUID indicates that the AVX512_POPCNT instruction is + * available. This is similar to the method above; see + * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + * + * Finally, we make sure the xgetbv result is consistent with the CPUID + * results. + */ +static bool +pg_popcount512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Check for AVX512VPOPCNTDQ and AVX512F */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + + if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0) + { + /* + * CPUID succeeded, does the current running OS support the + * ZMM registers which are required for AVX512? This check is + * required to make sure an old OS on a new CPU is correctly + * checked or a VM hypervisor is not excluding AVX512 ZMM + * support in the VM; see "5.1.9 Detection of AVX Instructions" + * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html + */ + uint64 xcr = 0; +#ifdef _MSC_VER + uint64 highlow = _xgetbv(xcr); + + return (highlow & 0xE0) != 0; +#else + uint32 high; + uint32 low; + + __asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr)); + return (low & 0xE0) != 0; +#endif + } /* POPCNT 512 */ + return false; +} + +/* + * These functions get called on the first call to pg_popcount32 etc. + * They detect whether we can use the asm implementations, and replace + * the function pointers so that subsequent calls are routed directly to + * the chosen implementation. + */ +static void set_up_function_pointers() +{ + if (pg_popcount512_available()) + { +#if defined(_MSC_VER) + pg_popcount_indirect = pg_popcount512_fast; +#else + pg_popcount = pg_popcount512_fast; +#endif + } + else + { +#if defined(_MSC_VER) + pg_popcount_indirect = pg_popcount_slow; +#else + pg_popcount = pg_popcount_slow; +#endif + } + if (pg_popcount_available()) + { + pg_popcount32 = pg_popcount32_fast; + pg_popcount64 = pg_popcount64_fast; + } + else + { + pg_popcount32 = pg_popcount32_slow; + pg_popcount64 = pg_popcount64_slow; + } +} + +int pg_popcount32_choose(uint32 word) +{ + set_up_function_pointers(); + return pg_popcount32(word); +} + +int +pg_popcount64_choose(uint64 word) +{ + set_up_function_pointers(); + return pg_popcount64(word); +} + +uint64 +pg_popcount_choose(const char *buf, int bytes) +{ + set_up_function_pointers(); +#if defined(_MSC_VER) + return pg_popcount_indirect(buf, bytes); +#else + return pg_popcount(buf, bytes); +#endif +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcnt_x86_64_accel.c b/src/port/pg_popcnt_x86_64_accel.c new file mode 100644 index 0000000000..aef32c1174 --- /dev/null +++ b/src/port/pg_popcnt_x86_64_accel.c @@ -0,0 +1,93 @@ +/*------------------------------------------------------------------------- + * + * pg_popcnt_x86_64_accel.c + * Miscellaneous functions for bit-wise operations. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcnt_x86_64_accel.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#if defined(HAVE__IMMINTRIN) +#include +#endif + +#include "port/pg_bitutils.h" + +#ifdef TRY_POPCNT_FAST +extern const uint8 pg_number_of_ones[256]; +extern uint64 pg_popcount_slow(const char *buf, int bytes); +uint64 pg_popcount512_fast(const char *buf, int bytes); +int pg_popcount32_fast(uint32 word); +int pg_popcount64_fast(uint64 word); + +/* + * pg_popcount32_fast + * Return the number of 1 bits set in word + */ +int pg_popcount32_fast(uint32 word) +{ +#ifdef _MSC_VER + return __popcnt(word); +#else + uint32 res; + + __asm__ __volatile__(" popcntl %1,%0\n" : "=q"(res) : "rm"(word) : "cc"); + return (int)res; +#endif +} + +/* + * pg_popcount64_fast + * Return the number of 1 bits set in word + */ +int +pg_popcount64_fast(uint64 word) +{ +#ifdef _MSC_VER + return __popcnt64(word); +#else + uint64 res; + + __asm__ __volatile__(" popcntq %1,%0\n" : "=q"(res) : "rm"(word) : "cc"); + return (int)res; +#endif +} + +/* + * Use AVX-512 Intrinsics for supported Intel CPUs or fall back the the software + * loop in pg_bunutils.c and use the best 32 or 64 bit fast methods. If no fast + * methods are used this will fall back to __builtin_* or pure software. + */ +uint64 +pg_popcount512_fast(const char *buf, int bytes) +{ +#if defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 + uint64 popcnt = 0; + __m512i accumulator = _mm512_setzero_si512(); + + while (bytes >= 64) + { + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + + accumulator = _mm512_add_epi64(accumulator, p); + bytes -= 64; + buf += 64; + } + + popcnt = _mm512_reduce_add_epi64(accumulator); + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char)*buf++]; + return popcnt; +#else + return pg_popcount_slow(buf, bytes); +#endif /* USE_AVX512_CODE */ +} +#endif /* TRY_POPCNT_FAST */