From da15aefd7269b8a342f751e2b5c2c4b9c1b0627c Mon Sep 17 00:00:00 2001 From: John Naylor Date: Mon, 2 Mar 2026 17:28:58 +0700 Subject: [PATCH v13] Enable autovectorizing page checksums with AVX2 where available We already rely on autovectorization for computing page checksums, but on x86 we can get nearly three times the performance by annotating pg_checksum_block() with a function target attribute for AVX2. That feature set not only uses 256-bit registers, but can also use vector multiplication rather than the vector shifts and adds available in SSE2. This matters most when using io_uring since in that case the checksum computation is not done in parallel via workers. Co-authored-by: Matthew Sterrett Co-authored-by: Andrew Kim Reviewed-by: Oleg Tselebrovskiy Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal --- config/c-compiler.m4 | 23 +++++++++++ configure | 46 +++++++++++++++++++++ configure.ac | 9 ++++ meson.build | 28 +++++++++++++ src/backend/storage/page/checksum.c | 44 +++++++++++++++++++- src/include/pg_config.h.in | 3 ++ src/include/port/pg_cpu.h | 3 ++ src/include/storage/checksum_block.inc.c | 42 +++++++++++++++++++ src/include/storage/checksum_impl.h | 52 ++++++++---------------- src/port/pg_cpu_x86.c | 4 ++ 10 files changed, 219 insertions(+), 35 deletions(-) create mode 100644 src/include/storage/checksum_block.inc.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 629572ee350..4d5acf8be6e 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -687,6 +687,29 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX2_SUPPORT +# --------------------------- +# Check if the compiler supports AVX2 as a target +# +# If AVX2 target attribute is supported, sets pgac_avx2_support. 
+AC_DEFUN([PGAC_AVX2_SUPPORT], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl +AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar], +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + }], + [return avx2_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx2_support=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX2_SUPPORT + # PGAC_AVX512_PCLMUL_INTRINSICS # --------------------------- # Check if the compiler supports AVX-512 carryless multiplication diff --git a/configure b/configure index 8e0e7483c1d..1ae527215c7 100755 --- a/configure +++ b/configure @@ -17725,6 +17725,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5 +$as_echo_n "checking for AVX2 target attribute support... " >&6; } +if ${pgac_cv_avx2_support+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif +int +main () +{ +return avx2_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_avx2_support=yes +else + pgac_cv_avx2_support=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5 +$as_echo "$pgac_cv_avx2_support" >&6; } +if test x"$pgac_cv_avx2_support" = x"yes"; then + pgac_avx2_support=yes +fi + + if test x"$pgac_avx2_support" = x"yes"; then + +$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for XSAVE intrinsics # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5 diff --git a/configure.ac b/configure.ac index 2baac5e9da7..a43add51ca6 100644 --- a/configure.ac +++ b/configure.ac @@ -2129,6 +2129,15 @@ else fi fi +# Check for AVX2 target support +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX2_SUPPORT() + if test x"$pgac_avx2_support" = x"yes"; then + AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.]) + fi +fi + # Check for XSAVE intrinsics # PGAC_XSAVE_INTRINSICS() diff --git a/meson.build b/meson.build index ea31cbce9c0..5cf23195a6a 100644 --- a/meson.build +++ b/meson.build @@ -2451,6 +2451,34 @@ int main(void) endif +############################################################### +# Check if the compiler supports AVX2 as a target +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +#include +#include +__attribute__((target("avx2"))) +static int avx2_test(void) +{ + return 0; +} + +int main(void) +{ + return avx2_test(); +} +''' + + if cc.links(prog, name: 'AVX2 support', args: test_c_args) + cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1) + endif + +endif + + 
############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 8716651c8b5..7ce51fe9d2e 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -13,10 +13,52 @@ */ #include "postgres.h" +#include "port/pg_cpu.h" #include "storage/checksum.h" /* * The actual code is in storage/checksum_impl.h. This is done so that * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * that file from the exported Postgres headers. (Compare our legacy + * CRC code in pg_crc.h.) + * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific + * coding without affecting external programs. */ +#define PG_CHECKSUM_INTERNAL #include "storage/checksum_impl.h" /* IWYU pragma: keep */ + + +static uint32 +pg_checksum_block_fallback(const PGChecksummablePage *page) +{ +#include "storage/checksum_block.inc.c" +} + +/* + * AVX2-optimized block checksum algorithm. + */ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK +pg_attribute_target("avx2") +static uint32 +pg_checksum_block_avx2(const PGChecksummablePage *page) +{ +#include "storage/checksum_block.inc.c" +} +#endif /* USE_AVX2_WITH_RUNTIME_CHECK */ + +/* + * Choose the best available checksum implementation. 
+ */ +static uint32 +pg_checksum_choose(const PGChecksummablePage *page) +{ + pg_checksum_block = pg_checksum_block_fallback; + +#ifdef USE_AVX2_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX2)) + pg_checksum_block = pg_checksum_block_avx2; +#endif + + return pg_checksum_block(page); +} + +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8d61918aff..5394a614f87 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -677,6 +677,9 @@ /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use AVX2 instructions with a runtime check. */ +#undef USE_AVX2_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h index b93b828d3ac..c5d96bb4f47 100644 --- a/src/include/port/pg_cpu.h +++ b/src/include/port/pg_cpu.h @@ -24,6 +24,9 @@ typedef enum X86FeatureId PG_SSE4_2, PG_POPCNT, + /* 256-bit YMM registers */ + PG_AVX2, + /* 512-bit ZMM registers */ PG_AVX512_BW, PG_AVX512_VL, diff --git a/src/include/storage/checksum_block.inc.c b/src/include/storage/checksum_block.inc.c new file mode 100644 index 00000000000..6ef8a911145 --- /dev/null +++ b/src/include/storage/checksum_block.inc.c @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * checksum_block.inc.c + * Core algorithm for page checksums, semi-private to checksum_impl.h + * and checksum.c. 
+ * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/checksum_block.inc.c + * + *------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INC_C here */ + +uint32 sums[N_SUMS]; +uint32 result = 0; +uint32 i, + j; + +/* ensure that the size is compatible with the algorithm */ +Assert(sizeof(PGChecksummablePage) == BLCKSZ); + +/* initialize partial checksums to their corresponding offsets */ +memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); + +/* main checksum calculation */ +for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], page->data[i][j]); + +/* finally add in two rounds of zeroes for additional mixing */ +for (i = 0; i < 2; i++) + for (j = 0; j < N_SUMS; j++) + CHECKSUM_COMP(sums[j], 0); + +/* xor fold partial checksums together */ +for (i = 0; i < N_SUMS; i++) + result ^= sums[i]; + +return result; diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 5c2dcbc63e7..28570abdda0 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -72,12 +72,13 @@ * random segments of page with 0x00, 0xFF and random data all show optimal * 2e-16 false positive rate within margin of error. * - * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. 
+ * Vectorization of the algorithm works best with a 32bit x 32bit -> 32bit + * vector integer multiplication instruction. Examples include x86 AVX2 + * extensions (vpmulld) and ARM NEON (vmul.i32). Without that, vectorization + * is still possible if the compiler can turn multiplication by FNV_PRIME + * into a sequence of vectorized shifts and adds. For simplicity we rely + * on the compiler to do the vectorization for us. For GCC and clang the + * flags -funroll-loops -ftree-vectorize are enough to achieve vectorization. * * The optimal amount of parallelism to use depends on CPU specific instruction * latency, SIMD instruction width, throughput and the amount of registers @@ -89,8 +90,9 @@ * * The parallelism number 32 was chosen based on the fact that it is the * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. + * leaving some free registers for intermediate values. For processors + * with 256-bit vector registers this leaves some performance on the table. + * * When vectorization is not available it might be beneficial to restructure * the computation to calculate a subset of the columns at a time and perform * multiple passes to avoid register spilling. This optimization opportunity @@ -138,6 +140,9 @@ do { \ (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ } while (0) +/* Provide a static definition for external programs */ +#ifndef PG_CHECKSUM_INTERNAL + /* * Block checksum algorithm. The page must be adequately aligned * (at least on 4-byte boundary). 
@@ -145,34 +150,13 @@ do { \ static uint32 pg_checksum_block(const PGChecksummablePage *page) { - uint32 sums[N_SUMS]; - uint32 result = 0; - uint32 i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], page->data[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; +#include "storage/checksum_block.inc.c" } +#else +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page); +#endif + /* * Compute the checksum for a Postgres page. * diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c index e2ab92b09ac..f069afd1c53 100644 --- a/src/port/pg_cpu_x86.c +++ b/src/port/pg_cpu_x86.c @@ -121,6 +121,10 @@ set_x86_features(void) xcr0_val = _xgetbv(0); #endif + /* Are YMM registers enabled? */ + if (mask_available(xcr0_val, XMM | YMM)) + X86Features[PG_AVX2] = reg[EBX] >> 5 & 1; + /* Are ZMM registers enabled? */ if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) -- 2.53.0