From ba867ed01a1a25b2603eeda13a44e94a0a61648e Mon Sep 17 00:00:00 2001 From: Paul Amonson Date: Thu, 21 Mar 2024 11:19:23 -0700 Subject: [PATCH 1/2] [Refactor] Split pg_popcount functions into multiple files. Signed-off-by: Paul Amonson --- src/include/port/pg_bitutils.h | 6 +- src/port/Makefile | 2 + src/port/meson.build | 2 + src/port/pg_bitutils.c | 232 +++------------------------ src/port/pg_popcount_x86_64_accel.c | 134 ++++++++++++++++ src/port/pg_popcount_x86_64_choose.c | 158 ++++++++++++++++++ 6 files changed, 324 insertions(+), 210 deletions(-) create mode 100644 src/port/pg_popcount_x86_64_accel.c create mode 100644 src/port/pg_popcount_x86_64_choose.c diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 53e5239717..26f6a48377 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -305,11 +305,13 @@ extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes); #else -/* Use a portable implementation -- no need for a function pointer. */ +/* + * Use a portable implementation -- no need for a function pointer. Use + * inlining for small speed increase. + */ extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); extern uint64 pg_popcount(const char *buf, int bytes); - #endif /* TRY_POPCNT_FAST */ /* diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..1499985dfc 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,8 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_popcount_x86_64_choose.o \ + pg_popcount_x86_64_accel.o \ pg_strong_random.o \ pgcheckdir.o \ pgmkdirp.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef..cf6e9fa06c 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,8 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_popcount_x86_64_choose.c', + 'pg_popcount_x86_64_accel.c', 'pg_strong_random.c', 'pgcheckdir.c', 'pgmkdirp.c', diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 1197696e97..21a4d0ca97 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -21,7 +21,6 @@ #include "port/pg_bitutils.h" - /* * Array giving the position of the left-most set bit for each possible * byte value. We count the right-most position as the 0th bit, and the @@ -103,196 +102,46 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -static inline int pg_popcount32_slow(uint32 word); -static inline int pg_popcount64_slow(uint64 word); -static uint64 pg_popcount_slow(const char *buf, int bytes); - -#ifdef TRY_POPCNT_FAST -static bool pg_popcount_available(void); -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); -static uint64 pg_popcount_choose(const char *buf, int bytes); -static inline int pg_popcount32_fast(uint32 word); -static inline int pg_popcount64_fast(uint64 word); -static uint64 pg_popcount_fast(const char *buf, int bytes); - -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; -uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose; -#endif /* TRY_POPCNT_FAST */ - -#ifdef TRY_POPCNT_FAST - +#ifndef TRY_POPCNT_FAST /* - * Return true if CPUID indicates that the POPCNT instruction is available. + * Optimize function signature if using the slow functions. */ -static bool -pg_popcount_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; +#define INLINE static inline -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); #else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} - -/* - * These functions get called on the first call to pg_popcount32 etc. - * They detect whether we can use the asm implementations, and replace - * the function pointers so that subsequent calls are routed directly to - * the chosen implementation. - */ -static int -pg_popcount32_choose(uint32 word) -{ - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - pg_popcount = pg_popcount_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - pg_popcount = pg_popcount_slow; - } - - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - pg_popcount = pg_popcount_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - pg_popcount = pg_popcount_slow; - } - - return pg_popcount64(word); -} - -static uint64 -pg_popcount_choose(const char *buf, int bytes) -{ - if (pg_popcount_available()) - { - pg_popcount32 = pg_popcount32_fast; - pg_popcount64 = pg_popcount64_fast; - pg_popcount = pg_popcount_fast; - } - else - { - pg_popcount32 = pg_popcount32_slow; - pg_popcount64 = pg_popcount64_slow; - pg_popcount = pg_popcount_slow; - } +#define INLINE +#endif /* !TRY_POPCNT_FAST */ - return pg_popcount(buf, bytes); -} +/* Forward References */ +INLINE int pg_popcount32_slow(uint32 word); +INLINE int pg_popcount64_slow(uint64 word); +INLINE uint64 pg_popcount_slow(const char *buf, int bytes); -/* - * pg_popcount32_fast - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_fast(uint32 word) +#ifndef TRY_POPCNT_FAST +/* Slow function defintions for exported functions. */ +int +pg_popcount32(uint32 word) { -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif + return pg_popcount32_slow(word); } -/* - * pg_popcount64_fast - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount64_fast(uint64 word) +int +pg_popcount64(uint64 word) { -#ifdef _MSC_VER - return __popcnt64(word); -#else - uint64 res; - -__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif + return pg_popcount64_slow(word); } - -/* - * pg_popcount_fast - * Returns the number of 1-bits in buf - */ -static uint64 -pg_popcount_fast(const char *buf, int bytes) +uint64 +pg_popcount(const char *buf, int bytes) { - uint64 popcnt = 0; - -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) - { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_fast(*words++); - bytes -= 8; - } - - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_fast(*words++); - bytes -= 4; - } - - buf = (const char *) words; - } -#endif - - /* Process any remaining bytes */ - while (bytes--) - popcnt += pg_number_of_ones[(unsigned char) *buf++]; - - return popcnt; + return pg_popcount_slow(buf, bytes); } - -#endif /* TRY_POPCNT_FAST */ - +#endif /* !TRY_POPCNT_FAST */ /* * pg_popcount32_slow * Return the number of 1 bits set in word */ -static inline int +INLINE int pg_popcount32_slow(uint32 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -314,7 +163,7 @@ pg_popcount32_slow(uint32 word) * pg_popcount64_slow * Return the number of 1 bits set in word */ -static inline int +INLINE int pg_popcount64_slow(uint64 word) { #ifdef HAVE__BUILTIN_POPCOUNT @@ -342,7 +191,7 @@ pg_popcount64_slow(uint64 word) * pg_popcount_slow * Returns the number of 1-bits in buf */ -static uint64 +INLINE uint64 pg_popcount_slow(const char *buf, int bytes) { uint64 popcnt = 0; @@ -383,36 +232,3 @@ pg_popcount_slow(const char *buf, int bytes) return popcnt; } - -#ifndef TRY_POPCNT_FAST - -/* - * When the POPCNT instruction is not available, there's no point in using - * function pointers to vary the implementation between the fast and slow - * method. We instead just make these actual external functions when - * TRY_POPCNT_FAST is not defined. The compiler should be able to inline - * the slow versions here. - */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_slow(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_slow(word); -} - -/* - * pg_popcount - * Returns the number of 1-bits in buf - */ -uint64 -pg_popcount(const char *buf, int bytes) -{ - return pg_popcount_slow(buf, bytes); -} - -#endif /* !TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_x86_64_accel.c b/src/port/pg_popcount_x86_64_accel.c new file mode 100644 index 0000000000..d5500d56e7 --- /dev/null +++ b/src/port/pg_popcount_x86_64_accel.c @@ -0,0 +1,134 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_x86_64_accel.c + * Miscellaneous functions for bit-wise operations. + * + * Copyright (c) 2019-2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_x86_64_accel.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "port/pg_bitutils.h" + +#if defined(HAVE__IMMINTRIN) +#include +#endif + +#ifdef TRY_POPCNT_FAST +int pg_popcount32_fast(uint32 word); +int pg_popcount64_fast(uint64 word); +uint64 pg_popcount_fast(const char *buf, int bytes); +uint64 pg_popcount512_fast(const char *buf, int bytes); + +/* + * pg_popcount32_fast + * Return the number of 1 bits set in word + */ +int +pg_popcount32_fast(uint32 word) +{ +#ifdef _MSC_VER + return __popcnt(word); +#else + uint32 res; + +__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); + return (int) res; +#endif +} + +/* + * pg_popcount64_fast + * Return the number of 1 bits set in word + */ +int +pg_popcount64_fast(uint64 word) +{ +#ifdef _MSC_VER + return __popcnt64(word); +#else + uint64 res; + +__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); + return (int) res; +#endif +} + +/* + * pg_popcount_fast + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_fast(const char *buf, int bytes) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if the buffer is aligned. */ + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64_fast(*words++); + bytes -= 8; + } + + buf = (const char *) words; + } +#else + /* Process in 32-bit chunks if the buffer is aligned. */ + if (buf == (const char *) TYPEALIGN(4, buf)) + { + const uint32 *words = (const uint32 *) buf; + + while (bytes >= 4) + { + popcnt += pg_popcount32_fast(*words++); + bytes -= 4; + } + + buf = (const char *) words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++]; + + return popcnt; +} + +/* + * Use AVX-512 Intrinsics for supported CPUs or fall back the non-152 fast + * implem entation and use the best 64 bit fast methods. If no fast + * methods are used this will fall back to __builtin_* or pure software. + */ +uint64 +pg_popcount512_fast(const char *buf, int bytes) +{ + uint64 popcnt = 0; + #if defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 + __m512i accumulator = _mm512_setzero_si512(); + + while (bytes >= 64) + { + const __m512i v = _mm512_loadu_si512((const __m512i *)buf); + const __m512i p = _mm512_popcnt_epi64(v); + + accumulator = _mm512_add_epi64(accumulator, p); + bytes -= 64; + buf += 64; + } + + popcnt = _mm512_reduce_add_epi64(accumulator); +#endif /* defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 */ + + /* Process any remaining bytes */ + return popcnt + pg_popcount_fast(buf, bytes); +} +#endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_x86_64_choose.c b/src/port/pg_popcount_x86_64_choose.c new file mode 100644 index 0000000000..e73d1999ad --- /dev/null +++ b/src/port/pg_popcount_x86_64_choose.c @@ -0,0 +1,158 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_x86_64_choose.c + * Miscellaneous functions for bit-wise operations. + * + * Copyright (c) 2019-2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_x86_64_choose.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" +#include "port/pg_bitutils.h" + +#ifdef TRY_POPCNT_FAST + +#ifdef HAVE__GET_CPUID +#include +#endif +#ifdef HAVE__CPUID +#include +#endif + +static bool pg_popcount_available(void); +static int pg_popcount32_choose(uint32 word); +static int pg_popcount64_choose(uint64 word); +static uint64 pg_popcount_choose(const char *buf, int bytes); +extern int pg_popcount32_fast(uint32 word); +extern int pg_popcount64_fast(uint64 word); +extern uint64 pg_popcount_fast(const char *buf, int bytes); +extern uint64 pg_popcount512_fast(const char *buf, int bytes); +extern int pg_popcount32_slow(uint32 word); +extern int pg_popcount64_slow(uint64 word); +extern uint64 pg_popcount_slow(const char *buf, int bytes); + +int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; +int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; +uint64 (*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose; + +/* + * Return true if CPUID indicates that the POPCNT instruction is available. + */ +static bool +pg_popcount_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + + return (exx[2] & (1 << 23)) != 0; /* POPCNT */ +} + +/* + * Return true if CPUID indicates that the AVX512_POPCNT instruction is + * available. This is similar to the method above; see + * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features + * + * Finally, we make sure the xgetbv result is consistent with the CPUID + * results. + */ +static bool +pg_popcount512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Check for AVX512VPOPCNTDQ and AVX512F */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + + if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0) + { + /* + * CPUID succeeded, does the current running OS support the + * ZMM registers which are required for AVX512? This check is + * required to make sure an old OS on a new CPU is correctly + * checked or a VM hypervisor is not excluding AVX512 ZMM + * support in the VM; see "5.1.9 Detection of AVX Instructions" + * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html + */ + uint64 xcr = 0; +#ifdef _MSC_VER + uint64 highlow = _xgetbv(xcr); + + return (highlow & 0xE0) != 0; +#else + uint32 high; + uint32 low; + + __asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr)); + return (low & 0xE0) != 0; +#endif + } /* POPCNT 512 */ + return false; +} + +/* + * These functions get called on the first call to pg_popcount32 etc. + * They detect whether we can use the asm implementations, and replace + * the function pointers so that subsequent calls are routed directly to + * the chosen implementation. + */ +static inline void set_function_pointers() +{ +if (pg_popcount512_available()) + { + pg_popcount32 = pg_popcount32_fast; + pg_popcount64 = pg_popcount64_fast; + pg_popcount = pg_popcount_fast; + } + else + { + if (pg_popcount_available()) + { + pg_popcount32 = pg_popcount32_fast; + pg_popcount64 = pg_popcount64_fast; + pg_popcount = pg_popcount_fast; + } + else + { + pg_popcount32 = pg_popcount32_slow; + pg_popcount64 = pg_popcount64_slow; + pg_popcount = pg_popcount_slow; + } + } +} + +static inline int +pg_popcount32_choose(uint32 word) +{ + set_function_pointers(); + return pg_popcount32(word); +} + +static inline int +pg_popcount64_choose(uint64 word) +{ + set_function_pointers(); + return pg_popcount64(word); +} + +static inline uint64 +pg_popcount_choose(const char *buf, int bytes) +{ + set_function_pointers(); + return pg_popcount(buf, bytes); +} + +#endif /* TRY_POPCNT_FAST */ -- 2.34.1