From e2830537bd5388e87ba7c4ebe61a156fcee17e4b Mon Sep 17 00:00:00 2001 From: Chiranmoy Bhattacharya Date: Wed, 12 Mar 2025 15:21:36 +0530 Subject: [PATCH v6] SVE and NEON support for popcount --- config/c-compiler.m4 | 36 ++++++++++ configure | 56 +++++++++++++++ configure.ac | 9 +++ meson.build | 33 +++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_bitutils.h | 25 +++++++ src/port/Makefile | 2 + src/port/meson.build | 2 + src/port/pg_bitutils.c | 55 +++++++++++++- src/port/pg_popcount_neon.c | 91 +++++++++++++++++++++++ src/port/pg_popcount_sve.c | 127 +++++++++++++++++++++++++++++++++ 11 files changed, 438 insertions(+), 1 deletion(-) create mode 100644 src/port/pg_popcount_neon.c create mode 100644 src/port/pg_popcount_sve.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8534cc54c13..c3c2d6fe29d 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -704,3 +704,39 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_AVX512_POPCNT_INTRINSICS + +# PGAC_ARM_SVE_POPCNT_INTRINSICS +# ------------------------------ +# Check if the compiler supports the ARM SVE popcount instructions using the +# svdup_u64, svwhilelt_b8, svcntb, svaddv, svadd_x, svcnt_x, svld1, +# svptrue_b64 and svand_x intrinsic functions. +# +# If the intrinsics are supported, sets pgac_arm_sve_popcnt_intrinsics. +AC_DEFUN([PGAC_ARM_SVE_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_sve_popcnt_intrinsics])])dnl +AC_CACHE_CHECK([for svcnt_x and other intrinsics], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int sve_popcount_test(void) + { + int popcnt = 0; + const char buf@<:@sizeof(uint64_t)@:>@; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; + }], + [return sve_popcount_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_ARM_SVE_POPCNT_INTRINSICS diff --git a/configure b/configure index 0ffcaeb4367..b227b826092 100755 --- a/configure +++ b/configure @@ -17049,6 +17049,62 @@ $as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x and other intrinsics" >&5 +$as_echo_n "checking for svcnt_x and other intrinsics... " >&6; } +if ${pgac_cv_arm_sve_popcnt_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + #if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) + #endif + static int sve_popcount_test(void) + { + int popcnt = 0; + const char buf[sizeof(uint64_t)]; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; + } +int +main () +{ +return sve_popcount_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_sve_popcnt_intrinsics=yes +else + pgac_cv_arm_sve_popcnt_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_sve_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_arm_sve_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_arm_sve_popcnt_intrinsics" = x"yes"; then + pgac_arm_sve_popcnt_intrinsics=yes +fi + + if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + +$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5 diff --git a/configure.ac b/configure.ac index f56681e0d91..5c649870550 100644 --- a/configure.ac +++ b/configure.ac @@ -2016,6 +2016,15 @@ if test x"$host_cpu" = x"x86_64"; then fi fi +# Check for ARM SVE popcount intrinsics +# +if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_SVE_POPCNT_INTRINSICS() + if test x"$pgac_arm_sve_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(USE_SVE_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARM popcount instructions.]) + fi +fi + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # PGAC_SSE42_CRC32_INTRINSICS() diff --git a/meson.build b/meson.build index 7dd7110318d..db3120442b5 100644 --- a/meson.build +++ b/meson.build @@ -2195,6 +2195,39 @@ int main(void) endif +############################################################### +# Check for the availability of ARM SVE popcount intrinsics. +############################################################### + +if host_cpu == 'aarch64' + + prog = ''' +#include +#if defined(__has_attribute) && __has_attribute(target) + __attribute__((target("arch=armv8-a+sve"))) +#endif +int main () +{ + int popcnt = 0; + const char buf[sizeof(uint64_t)]; + svbool_t pred8 = svwhilelt_b8(0, 8), pred64 = svptrue_b64(); + svuint64_t accum = svdup_u64(0), vec; + if (svcntb() > 0) + popcnt = svaddv(pred8, svcnt_x(pred8, svld1(pred8, (const uint8_t *) buf))); + vec = svand_x(pred64, svld1(pred64, (const uint64_t *) buf), 0xf0f0); + accum = svadd_x(pred64, accum, svcnt_x(pred64, vec)); + popcnt += svaddv(pred64, accum); + return popcnt; +} +''' + + if cc.links(prog, name: 'ARM SVE popcount', args: test_c_args) + cdata.set('USE_SVE_POPCNT_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Select CRC-32C implementation. # diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798abd..29c32bbbbe3 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -648,6 +648,9 @@ /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 62554ce685a..ffafbc926af 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -298,6 +298,16 @@ pg_ceil_log2_64(uint64 num) #endif #endif +/* + * On aarch64, try using SVE popcount instructions, but only if + * we can verify that the CPU supports it via a runtime check. + * + * Otherwise, we fall back to NEON implementation. + */ +#if defined(__aarch64__) && defined(__ARM_NEON) +#define POPCNT_FAST_AARCH64 1 +#endif + #ifdef TRY_POPCNT_FAST /* Attempt to use the POPCNT instruction, but perform a runtime check first */ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); @@ -315,6 +325,21 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#elif POPCNT_FAST_AARCH64 +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); +extern uint64 pg_popcount_neon(const char *buf, int bytes); +extern uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask); +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); + +/* Attempt to use the SVE instructions, but perform a runtime check first */ +#if USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern bool pg_popcount_sve_available(void); +extern uint64 pg_popcount_sve(const char *buf, int bytes); +extern uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..9ea21fb6477 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -45,6 +45,8 @@ OBJS = \ path.o \ pg_bitutils.o \ pg_popcount_avx512.o \ + pg_popcount_neon.o \ + pg_popcount_sve.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..7a0743ab233 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -8,6 +8,8 @@ pgport_sources = [ 'path.c', 'pg_bitutils.c', 'pg_popcount_avx512.c', + 'pg_popcount_neon.c', + 'pg_popcount_sve.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 5677525693d..2afe76b2796 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -339,6 +339,59 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) #endif /* TRY_POPCNT_FAST */ +#ifdef POPCNT_FAST_AARCH64 +static uint64 pg_popcount_choose_aarch64(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose_aarch64(const char *buf, int bytes, bits8 mask); +uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose_aarch64; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose_aarch64; + +/* + * On AArch64 these functions are invoked on the first call to pg_popcount and + * pg_popcount_masked. They detect whether we can use the SVE implementations, + * and replace the function pointers so that subsequent calls are routed + * directly to the chosen implementation. + */ +static inline void +choose_popcount_functions_aarch64(void) +{ + pg_popcount_optimized = pg_popcount_neon; + pg_popcount_masked_optimized = pg_popcount_masked_neon; + +#if USE_SVE_POPCNT_WITH_RUNTIME_CHECK + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } +#endif +} + +static uint64 +pg_popcount_choose_aarch64(const char *buf, int bytes) +{ + choose_popcount_functions_aarch64(); + return pg_popcount_optimized(buf, bytes); +} + +static uint64 +pg_popcount_masked_choose_aarch64(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions_aarch64(); + return pg_popcount_masked(buf, bytes, mask); +} + +int +pg_popcount32(uint32 word) +{ + return pg_popcount32_slow(word); +} + +int +pg_popcount64(uint64 word) +{ + return pg_popcount64_slow(word); +} +#endif /* POPCNT_FAST_AARCH64 */ /* * pg_popcount32_slow @@ -486,7 +539,7 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) return popcnt; } -#ifndef TRY_POPCNT_FAST +#if !defined(TRY_POPCNT_FAST) && !defined(POPCNT_FAST_AARCH64) /* * When the POPCNT instruction is not available, there's no point in using diff --git a/src/port/pg_popcount_neon.c b/src/port/pg_popcount_neon.c new file mode 100644 index 00000000000..ee78fa01f26 --- /dev/null +++ b/src/port/pg_popcount_neon.c @@ -0,0 +1,91 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_neon.c + * Holds the NEON pg_popcount() implementation. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_neon.c + *POPCNT_FAST_AARCH64 + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "port/pg_bitutils.h" + +#ifdef POPCNT_FAST_AARCH64 + +#include + +/* + * pg_popcount_neon + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_neon(const char *buf, int bytes) +{ + uint8x16_t vec8; + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0); + uint32 i = 0, + vec_len = sizeof(uint8x16_t), + loop_bytes = bytes & ~(vec_len * 2 - 1); + uint64 popcnt = 0; + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec8 = vld1q_u8((const uint8 *) (buf + i)); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec8)))); + vec8 = vld1q_u8((const uint8 *) (buf + i + vec_len)); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec8)))); + } + + /* Reduce the accumulators */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + + /* Process any remaining bytes */ + bytes -= loop_bytes; + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + +/* +* pg_popcount_masked_neon +* Returns the number of 1-bits in buf after applying the mask +*/ +uint64 +pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) +{ + uint8x16_t vec8, + mask_vec = vdupq_n_u8(mask); + uint64x2_t accum1 = vdupq_n_u64(0), + accum2 = vdupq_n_u64(0); + uint32 i = 0, + vec_len = sizeof(uint8x16_t), + loop_bytes = bytes & ~(vec_len * 2 - 1); + uint64 popcnt = 0; + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec8 = vandq_u8(vld1q_u8((const uint8 *) (buf + i)), mask_vec); + accum1 = vpadalq_u32(accum1, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec8)))); + vec8 = vandq_u8(vld1q_u8((const uint8 *) (buf + i)), mask_vec); + accum2 = vpadalq_u32(accum2, vpaddlq_u16(vpaddlq_u8(vcntq_u8(vec8)))); + } + + /* Reduce the accumulators */ + popcnt += vaddvq_u64(vaddq_u64(accum1, accum2)); + + /* Process any remaining bytes */ + bytes -= loop_bytes; + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + + return popcnt; +} + +#endif /* POPCNT_FAST_AARCH64 */ diff --git a/src/port/pg_popcount_sve.c b/src/port/pg_popcount_sve.c new file mode 100644 index 00000000000..20bb20d9130 --- /dev/null +++ b/src/port/pg_popcount_sve.c @@ -0,0 +1,127 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_sve.c + * Holds the SVE pg_popcount() implementation. + * + * Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_sve.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "port/pg_bitutils.h" + +/* + * It's unlikely that USE_SVE_POPCNT_WITH_RUNTIME_CHECK is set and + * POPCNT_FAST_AARCH64 is not, but we check it anyway to be sure. + */ +#if defined(POPCNT_FAST_AARCH64) && defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) + +#include + +#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) +#include +#endif + +/* + * Returns true if the CPU supports the instructions required for the SVE + * pg_popcount() implementation. + */ +bool +pg_popcount_sve_available(void) +{ +#if defined(HAVE_ELF_AUX_INFO) && defined(__aarch64__) /* FreeBSD */ + unsigned long hwcap; + return elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) == 0 && + (hwcap & HWCAP_SVE) != 0; +#elif defined(HAVE_GETAUXVAL) && defined(__aarch64__) /* Linux */ + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +#else + return false; +#endif +} + +/* + * pg_popcount_sve + * Returns the number of 1-bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + svbool_t pred = svptrue_b64(); + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + loop_bytes = bytes & ~(vec_len * 2 - 1); + uint64 popcnt = 0; + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svld1(pred, (const uint64 *) (buf + i)); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svld1(pred, (const uint64 *) (buf + i + vec_len)); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* Reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vector */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + popcnt += svaddv(pred, svcnt_x(pred, svld1(pred, (const uint8 *) (buf + i)))); + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns the number of 1-bits in buf after applying the mask + */ +pg_attribute_target("arch=armv8-a+sve") +uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + svbool_t pred = svptrue_b64(); + svuint8_t vec8; + svuint64_t vec64, + accum1 = svdup_u64(0), + accum2 = svdup_u64(0); + uint32 i = 0, + vec_len = svcntb(), + loop_bytes = bytes & ~(vec_len * 2 - 1); + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + + /* Process 2 complete vectors */ + for (; i < loop_bytes; i += vec_len * 2) + { + vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i)), mask64); + accum1 = svadd_x(pred, accum1, svcnt_x(pred, vec64)); + vec64 = svand_x(pred, svld1(pred, (const uint64 *) (buf + i + vec_len)), mask64); + accum2 = svadd_x(pred, accum2, svcnt_x(pred, vec64)); + } + + /* Reduce the accumulators */ + popcnt += svaddv(pred, svadd_x(pred, accum1, accum2)); + + /* Process the last incomplete vectors */ + for(; i < bytes; i += vec_len) + { + pred = svwhilelt_b8(i, (uint32) bytes); + vec8 = svand_x(pred, svld1(pred, (const uint8 *) (buf + i)), mask); + popcnt += svaddv(pred, svcnt_x(pred, vec8)); + } + + return popcnt; +} + +#endif /* POPCNT_FAST_AARCH64 && USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -- 2.34.1