From 6ae6af741b866ed95d367d810cdd4eef64a6ac91 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Tue, 11 Mar 2025 14:16:13 +0700
Subject: [PATCH v14 6/8] AVX-512 CRC / Meson

Author: Raghuveer Devulapalli
Author: Paul Amonson
---
 meson.build                  | 23 ++++++++++
 src/include/port/pg_crc32c.h |  9 +---
 src/port/pg_crc32c_sse42.c   | 83 +++++++++++++++---------------------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/meson.build b/meson.build
index 13c13748e5d..f2f1164a25e 100644
--- a/meson.build
+++ b/meson.build
@@ -2352,6 +2352,29 @@ int main(void)
     have_optimized_crc = true
   endif
 
+  avx512_crc_prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx512vl,vpclmulqdq")))
+#endif
+int main(void)
+{
+    __m512i x0 = _mm512_set1_epi32(0x1);
+    __m512i x1 = _mm512_set1_epi32(0x2);
+    __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq
+    __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl
+    int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction
+    return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+}
+'''
+
+  if cc.links(avx512_crc_prog,
+      name: 'AVX-512 CRC32C',
+      args: test_c_args)
+    cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 endif
 
 elif host_cpu == 'arm' or host_cpu == 'aarch64'
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 28253b48018..a45f56a9405 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -37,11 +37,6 @@
 typedef uint32 pg_crc32c;
 
-/* WIP: configure checks */
-#ifdef __x86_64__
-#define USE_PCLMUL_WITH_RUNTIME_CHECK
-#endif
-
 /* The INIT and EQ macros are the same for all implementations.
  */
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
@@ -60,7 +55,7 @@ typedef uint32 pg_crc32c;
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
 #endif
 
@@ -106,7 +101,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
 #endif
 
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index c57d6c6293b..f392eb5b236 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,7 +15,7 @@
 #include "c.h"
 
 #include <nmmintrin.h>
-#include <wmmintrin.h>
+#include <immintrin.h>
 
 #include "port/pg_crc32c.h"
 
@@ -70,16 +70,16 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	return crc;
 }
 
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
 /* Generated by https://github.com/corsix/fast-crc32/ using: */
-/* ./generate -i sse -p crc32c -a v4e */
+/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
 /* MIT licensed */
 
-#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
-#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
+#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
 
-pg_attribute_target("sse4.2,pclmul")
+pg_attribute_target("avx512vl,vpclmulqdq")
 pg_crc32c
 pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 {
@@ -88,67 +88,54 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 	size_t		len = length;
 	const char *buf = data;
 
-	// This prolog is trying to avoid loads straddling
-	// cache lines, but it doesn't seem worth it if
-	// we're trying to be fast on small inputs as well
-#if 0
-	for (; len && ((uintptr_t) buf & 7); --len)
+	/* Align on cacheline boundary. WIP: The threshold needs testing. */
+	if (unlikely(len > 256))
 	{
-		crc0 = _mm_crc32_u8(crc0, *buf++);
-	}
-	if (((uintptr_t) buf & 8) && len >= 8)
-	{
-		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
-		buf += 8;
-		len -= 8;
+		for (; len && ((uintptr_t) buf & 7); --len)
+		{
+			crc0 = _mm_crc32_u8(crc0, *buf++);
+		}
+		while (((uintptr_t) buf & 56) && len >= 8)
+		{
+			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+			buf += 8;
+			len -= 8;
+		}
 	}
-#endif
+
 	if (len >= 64)
 	{
 		const char *end = buf + len;
 		const char *limit = buf + len - 64;
+		__m128i		z0;
 
 		/* First vector chunk. */
-		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+		__m512i		x0 = _mm512_loadu_si512((const void *) buf),
 					y0;
-		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
-					y1;
-		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
-					y2;
-		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
-					y3;
-		__m128i		k;
-
-		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
-		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		__m512i		k;
+
+		k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
+		x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
 		buf += 64;
 
+		/* Main loop. */
 		while (buf <= limit)
 		{
 			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
-			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
-			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
-			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
-			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
-			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void *) buf), 0x96);
 			buf += 64;
 		}
 
-		/* Reduce x0 ... x3 to just x0. */
-		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
-		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
-		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
-		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
-		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+		/* Reduce 512 bits to 128 bits. */
+		k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0);
+		y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
+		y0 = _mm512_xor_si512(y0, k);
+		z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96);
+		z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
 
 		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
-		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
-		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
 
 		len = end - buf;
 	}
-- 
2.48.1