From 6ae6af741b866ed95d367d810cdd4eef64a6ac91 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Tue, 11 Mar 2025 14:16:13 +0700
Subject: [PATCH v14 6/8] AVX-512 CRC / Meson

Author: Raghuveer Devulapalli
Author: Paul Amonson
---
 meson.build                  | 23 ++++++++++
 src/include/port/pg_crc32c.h |  9 +---
 src/port/pg_crc32c_sse42.c   | 83 +++++++++++++++---------------------
 3 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/meson.build b/meson.build
index 13c13748e5d..f2f1164a25e 100644
--- a/meson.build
+++ b/meson.build
@@ -2352,6 +2352,29 @@ int main(void)
     have_optimized_crc = true
   endif
 
+  avx512_crc_prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx512vl,vpclmulqdq")))
+#endif
+int main(void)
+{
+    __m512i x0 = _mm512_set1_epi32(0x1);
+    __m512i x1 = _mm512_set1_epi32(0x2);
+    __m512i x2 = _mm512_clmulepi64_epi128(x1, x0, 0x00); // vpclmulqdq
+    __m128i a1 = _mm_xor_epi64(_mm512_castsi512_si128(x1), _mm512_castsi512_si128(x0)); //avx512vl
+    int64_t val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); // 64-bit instruction
+    return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+}
+'''
+
+  if cc.links(avx512_crc_prog,
+      name: 'AVX-512 CRC32C',
+      args: test_c_args)
+    cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 endif
 
 elif host_cpu == 'arm' or host_cpu == 'aarch64'
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 28253b48018..a45f56a9405 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -37,11 +37,6 @@
 typedef uint32 pg_crc32c;
 
-/* WIP: configure checks */
-#ifdef __x86_64__
-#define USE_PCLMUL_WITH_RUNTIME_CHECK
-#endif
-
 /* The INIT and EQ macros are the same for all implementations.
  */
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
@@ -60,7 +55,7 @@ typedef uint32 pg_crc32c;
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
 #endif
 
@@ -106,7 +101,7 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t len);
 #endif
 
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index c57d6c6293b..f392eb5b236 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,7 +15,7 @@
 #include "c.h"
 
 #include <nmmintrin.h>
-#include <wmmintrin.h>
+#include <immintrin.h>
 
 #include "port/pg_crc32c.h"
 
@@ -70,16 +70,16 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	return crc;
 }
 
-#ifdef USE_PCLMUL_WITH_RUNTIME_CHECK
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
 /* Generated by https://github.com/corsix/fast-crc32/ using: */
-/* ./generate -i sse -p crc32c -a v4e */
+/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
 /* MIT licensed */
 
-#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
-#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
+#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
 
-pg_attribute_target("sse4.2,pclmul")
+pg_attribute_target("avx512vl,vpclmulqdq")
 pg_crc32c
 pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 {
@@ -88,67 +88,54 @@ pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
 	size_t		len = length;
 	const char *buf = data;
 
-	// This prolog is trying to avoid loads straddling
-	// cache lines, but it doesn't seem worth it if
-	// we're trying to be fast on small inputs as well
-#if 0
-	for (; len && ((uintptr_t) buf & 7); --len)
+	/* Align on cacheline boundary. WIP: The threshold needs testing. */
+	if (unlikely(len > 256))
 	{
-		crc0 = _mm_crc32_u8(crc0, *buf++);
-	}
-	if (((uintptr_t) buf & 8) && len >= 8)
-	{
-		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
-		buf += 8;
-		len -= 8;
+		for (; len && ((uintptr_t) buf & 7); --len)
+		{
+			crc0 = _mm_crc32_u8(crc0, *buf++);
+		}
+		while (((uintptr_t) buf & 56) && len >= 8)
+		{
+			crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+			buf += 8;
+			len -= 8;
+		}
 	}
-#endif
+
 	if (len >= 64)
 	{
 		const char *end = buf + len;
 		const char *limit = buf + len - 64;
+		__m128i		z0;
 
 		/* First vector chunk. */
-		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+		__m512i		x0 = _mm512_loadu_si512((const void *) buf),
 					y0;
-		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
-					y1;
-		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
-					y2;
-		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
-					y3;
-		__m128i		k;
-
-		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
-		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		__m512i		k;
+
+		k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
+		x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
 		buf += 64;
 
+		/* Main loop. */
 		while (buf <= limit)
 		{
 			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
-			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
-			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
-			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
-			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
-			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			x0 = _mm512_ternarylogic_epi64(x0, y0, _mm512_loadu_si512((const void *) buf), 0x96);
 			buf += 64;
 		}
 
-		/* Reduce x0 ... x3 to just x0. */
-		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
-		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
-		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
-		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
-		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
-		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
-		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+		/* Reduce 512 bits to 128 bits. */
+		k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, 0x3da6d0cb, 0, 0xba4fc28e, 0, 0xf20c0dfe, 0, 0x493c7d27, 0, 0, 0, 0, 0);
+		y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
+		y0 = _mm512_xor_si512(y0, k);
+		z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), _mm512_extracti32x4_epi32(y0, 1), _mm512_extracti32x4_epi32(y0, 2), 0x96);
+		z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
 
 		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
-		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
-		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
 
 		len = end - buf;
 	}
-- 
2.48.1