From 2306c2a94e1610b76a1b7584bbaea63d8bf57302 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Wed, 11 Dec 2024 11:30:33 +0700 Subject: [PATCH v10 1/3] Execute hardware CRC computation in parallel CRC computations on the current input word depend not only on that input, but also on the CRC of the previous input. This means that the speed is limited by the latency of the CRC instruction. Most modern CPUs can start executing a new CRC instruction before a currently executing one has finished, i.e. the reciprocal throughput is lower than latency. By computing partial CRCs of non-overlapping segments of the input, we can achieve the full throughput that the CPU is capable of. To preserve the correctness of the result, however, we must recombine the partial results using carryless multiplication with constants specific to the input length. We get these from a lookup table of pre-computed CRCs. Because of the overhead of the recombination step, parallelism is only faster with inputs of at least a few hundred bytes. For now we only implement parallelism for x86 and Arm. It might be worthwhile to apply this technique to LoongArch, depending on the throughput of CRC on that platform. XXX The lookup table and supporting code are found in pg_crc32c_sb8.c, which is now built unconditionally on all platforms. Perhaps s/sb8/common/ ? This technique originated from the Intel white paper "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction", by Vinodh Gopal et al, 2011. Thanks to Raghuveer Devulapalli for assistance in verifying the usability of this technique from a legal perspective. Xiang Gao's original proposal was specific to the Arm architecture, computed in fixed-size chunks of 1024 bytes, and required hardware support for carryless multiplication. I added support for x86 and a wider range of chunk sizes, and switched to pure C for carryless multiplication. 
The portability of the latter is important for two reasons: 1) We may want to use this technique on architectures that don't have hardware carryless multiplication and 2) This is intended as a fallback, since if hardware carryless multiplication is available, there are other algorithms that are useful on much smaller inputs than this one. Author: Xiang Gao Author: John Naylor Reviewed-by: Nathan Bossart Discussion: https://postgr.es/m/DB9PR08MB6991329A73923BF8ED4B3422F5DBA@DB9PR08MB6991.eurprd08.prod.outlook.com --- configure | 5 +- configure.ac | 5 +- src/include/port/pg_crc32c.h | 12 ++++ src/port/Makefile | 1 + src/port/meson.build | 6 +- src/port/pg_crc32c_armv8.c | 34 +++++++++++ src/port/pg_crc32c_sb8.c | 111 +++++++++++++++++++++++++++++++++++ src/port/pg_crc32c_sse42.c | 77 +++++++++++++++++++++++- 8 files changed, 239 insertions(+), 12 deletions(-) diff --git a/configure b/configure index 93fddd69981..3403bb0c931 100755 --- a/configure +++ b/configure @@ -17692,7 +17692,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else @@ -17708,7 +17708,7 @@ $as_echo "ARMv8 CRC instructions" >&6; } $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else @@ -17723,7 +17723,6 @@ $as_echo "LoongArch CRCC instructions" >&6; } $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: 
slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } fi diff --git a/configure.ac b/configure.ac index b6d02f5ecc7..855949b7d74 100644 --- a/configure.ac +++ b/configure.ac @@ -2156,7 +2156,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then @@ -2166,7 +2166,7 @@ else else if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" + PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else if test x"$USE_LOONGARCH_CRC32C" = x"1"; then @@ -2175,7 +2175,6 @@ else AC_MSG_RESULT(LoongArch CRCC instructions) else AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" AC_MSG_RESULT(slicing-by-8) fi fi diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 65ebeacf4b1..e6c149c71f1 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -41,6 +41,8 @@ typedef uint32 pg_crc32c; #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF) #define EQ_CRC32C(c1, c2) ((c1) == (c2)) +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); + #if defined(USE_SSE42_CRC32C) /* Use Intel SSE4.2 instructions. 
*/ #define COMP_CRC32C(crc, data, len) \ @@ -107,4 +109,14 @@ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len) #endif +/* semi-private to files in src/port that compute CRCs in parallel */ + +#define CRC_BYTES_PER_ITER (3 * sizeof(uint64)) +/* for parallel computation, max number of words per block for recombination */ +#define CRC_MAX_BLOCK_LEN 350 + +extern PGDLLIMPORT const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN]; + +extern uint64 pg_clmul(uint32 a, uint32 b); + #endif /* PG_CRC32C_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 4c224319512..a8e2467a866 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -44,6 +44,7 @@ OBJS = \ noblock.o \ path.o \ pg_bitutils.o \ + pg_crc32c_sb8.o \ pg_popcount_avx512.o \ pg_strong_random.o \ pgcheckdir.o \ diff --git a/src/port/meson.build b/src/port/meson.build index 7fcfa728d43..8aed1de2d1d 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -7,6 +7,7 @@ pgport_sources = [ 'noblock.c', 'path.c', 'pg_bitutils.c', + 'pg_crc32c_sb8.c', 'pg_popcount_avx512.c', 'pg_strong_random.c', 'pgcheckdir.c', @@ -84,19 +85,14 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], # loongarch ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], - - # generic fallback - ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] pgport_cflags = {'crc': cflags_crc} diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 5ba070bb99d..0265a2a13d7 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -23,6 +23,8 
@@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; + const size_t min_blocklen = 42; /* Min size to consider interleaving */ + const pg_crc32c orig_crc = crc; // XXX not for commit /* * ARMv8 doesn't require alignment, but aligned memory access is @@ -48,6 +50,36 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) p += 4; } + /* See pg_crc32c_sse42.c for explanation */ + while (p + min_blocklen * CRC_BYTES_PER_ITER <= pend) + { + const size_t block_len = Min(CRC_MAX_BLOCK_LEN, (pend - p) / CRC_BYTES_PER_ITER); + const uint64 *in64 = (const uint64 *) (p); + pg_crc32c crc0 = crc, + crc1 = 0, + crc2 = 0; + uint64 mul0, + mul1, + precompute; + + for (int i = 0; i < block_len; i++, in64++) + { + crc0 = __crc32cd(crc0, *(in64)); + crc1 = __crc32cd(crc1, *(in64 + block_len)); + crc2 = __crc32cd(crc2, *(in64 + block_len * 2)); + } + + precompute = combine_crc_lookup[block_len - 1]; + mul0 = pg_clmul(crc0, (uint32) precompute); + mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); + + crc0 = __crc32cd(0, mul0); + crc1 = __crc32cd(0, mul1); + crc = crc0 ^ crc1 ^ crc2; + + p += block_len * CRC_BYTES_PER_ITER; + } + /* Process eight bytes at a time, as far as we can. 
*/ while (p + 8 <= pend) { @@ -71,5 +103,7 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) crc = __crc32cb(crc, *p); } + // XXX not for commit + Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, len)); return crc; } diff --git a/src/port/pg_crc32c_sb8.c b/src/port/pg_crc32c_sb8.c index 19659d186a0..004fe92d70b 100644 --- a/src/port/pg_crc32c_sb8.c +++ b/src/port/pg_crc32c_sb8.c @@ -1167,3 +1167,114 @@ static const uint32 pg_crc32c_table[8][256] = { } #endif /* WORDS_BIGENDIAN */ }; + + +/* + * Carryless multiplication in software + */ +uint64 +pg_clmul(uint32 a, uint32 b) +{ + uint64 result = 0; + + for (uint32 i = 0; i < 32; i++) + if ((a >> i) & 1) + result ^= (uint64) b << i; + + return result; +} + +/* + * Lookup table for combining partial CRC computations + */ +const uint64 combine_crc_lookup[CRC_MAX_BLOCK_LEN] = +{ + 0x00000001493c7d27, 0x493c7d27ba4fc28e, 0xf20c0dfeddc0152b, 0xba4fc28e9e4addf8, + 0x3da6d0cb39d3b296, 0xddc0152b0715ce53, 0x1c291d0447db8317, 0x9e4addf80d3b6092, + 0x740eef02c96cfdc0, 0x39d3b296878a92a7, 0x083a6eecdaece73e, 0x0715ce53ab7aff2a, + 0xc49f4f672162d385, 0x47db831783348832, 0x2ad91c30299847d5, 0x0d3b6092b9e02b86, + 0x6992cea218b33a4e, 0xc96cfdc0b6dd949b, 0x7e90804878d9ccb7, 0x878a92a7bac2fd7b, + 0x1b3d8f29a60ce07b, 0xdaece73ece7f39f4, 0xf1d0f55e61d82e56, 0xab7aff2ad270f1a2, + 0xa87ab8a8c619809d, 0x2162d3852b3cac5d, 0x8462d80065863b64, 0x833488321b03397f, + 0x71d111a8ebb883bd, 0x299847d5b3e32c28, 0xffd852c6064f7f26, 0xb9e02b86dd7e3b0c, + 0xdcb17aa4f285651c, 0x18b33a4e10746f3c, 0xf37c5aeec7a68855, 0xb6dd949b271d9844, + 0x6051d5a28e766a0c, 0x78d9ccb793a5f730, 0x18b0d4ff6cb08e5c, 0xbac2fd7b6b749fb2, + 0x21f3d99c1393e203, 0xa60ce07bcec3662e, 0x8f15801496c515bb, 0xce7f39f4e6fc4e6a, + 0xa00457f78227bb8a, 0x61d82e56b0cd4768, 0x8d6d2c4339c7ff35, 0xd270f1a2d7a4825c, + 0x00ac29cf0ab3844b, 0xc619809d0167d312, 0xe9adf796f6076544, 0x2b3cac5d26f6a60a, + 0x96638b34a741c1bf, 0x65863b6498d8d9cb, 0xe0e9f35149c3cc9c, 
0x1b03397f68bce87a, + 0x9af01f2d57a3d037, 0xebb883bd6956fc3b, 0x2cff42cf42d98888, 0xb3e32c283771e98f, + 0x88f25a3ab42ae3d9, 0x064f7f262178513a, 0x4e36f0b0e0ac139e, 0xdd7e3b0c170076fa, + 0xbd6f81f8444dd413, 0xf285651c6f345e45, 0x91c9bd4b41d17b64, 0x10746f3cff0dba97, + 0x885f087ba2b73df1, 0xc7a68855f872e54c, 0x4c1449321e41e9fc, 0x271d984486d8e4d2, + 0x52148f02651bd98b, 0x8e766a0c5bb8f1bc, 0xa3c6f37aa90fd27a, 0x93a5f730b3af077a, + 0xd7c0557f4984d782, 0x6cb08e5cca6ef3ac, 0x63ded06a234e0b26, 0x6b749fb2dd66cbbb, + 0x4d56973c4597456a, 0x1393e203e9e28eb4, 0x9669c9df7b3ff57a, 0xcec3662ec9c8b782, + 0xe417f38a3f70cc6f, 0x96c515bb93e106a4, 0x4b9e0f7162ec6c6d, 0xe6fc4e6ad813b325, + 0xd104b8fc0df04680, 0x8227bb8a2342001e, 0x5b3977300a2a8d7e, 0xb0cd47686d9a4957, + 0xe78eb416e8b6368b, 0x39c7ff35d2c3ed1a, 0x61ff0e01995a5724, 0xd7a4825c9ef68d35, + 0x8d96551c0c139b31, 0x0ab3844bf2271e60, 0x0bf80dd20b0bf8ca, 0x0167d3122664fd8b, + 0x8821abeded64812d, 0xf607654402ee03b2, 0x6a45d2b28604ae0f, 0x26f6a60a363bd6b3, + 0xd8d26619135c83fd, 0xa741c1bf5fabe670, 0xde87806c35ec3279, 0x98d8d9cb00bcf5f6, + 0x143387548ae00689, 0x49c3cc9c17f27698, 0x5bd2011f58ca5f00, 0x68bce87aaa7c7ad5, + 0xdd07448eb5cfca28, 0x57a3d037ded288f8, 0xdde8f5b959f229bc, 0x6956fc3b6d390dec, + 0xa3e3e02c37170390, 0x42d988886353c1cc, 0xd73c7beac4584f5c, 0x3771e98ff48642e9, + 0x80ff0093531377e2, 0xb42ae3d9dd35bc8d, 0x8fe4c34db25b29f2, 0x2178513a9a5ede41, + 0xdf99fc11a563905d, 0xe0ac139e45cddf4e, 0x6c23e841acfa3103, 0x170076faa51b6135, + 0xfe314258dfd94fb2, 0x444dd41380f2886b, 0x0d8373a067969a6a, 0x6f345e45021ac5ef, + 0x19e3635ee8310afa, 0x41d17b6475451b04, 0x29f268b48e1450f7, 0xff0dba97cbbe4ee1, + 0x1dc0632a3a83de21, 0xa2b73df1e0cdcf86, 0x1614f396453c1679, 0xf872e54cdefba41c, + 0x9e2993d3613eee91, 0x1e41e9fcddaf5114, 0x6bebd73c1f1dd124, 0x86d8e4d2bedc6ba1, + 0x63ae91e6eca08ffe, 0x651bd98b3ae30875, 0xf8c9da7a0cd1526a, 0x5bb8f1bcb1630f04, + 0x945a19c1ff47317b, 0xa90fd27ad6c3a807, 0xee8213b79a7781e0, 0xb3af077a63d097e9, + 
0x93781dc71d31175f, 0x4984d78294eb256e, 0xccc4a1b913184649, 0xca6ef3ac4be7fd90, + 0xa2c2d9717d5c1d64, 0x234e0b2680ba859a, 0x1cad44526eeed1c9, 0xdd66cbbb22c3799f, + 0x74922601d8ecc578, 0x4597456ab3a6da94, 0xc55f7eabcaf933fe, 0xe9e28eb450bfaade, + 0xa19623292e7d11a7, 0x7b3ff57a7d14748f, 0x2d37074932d8041c, 0xc9c8b782889774e1, + 0x397d84a16cc8a0ff, 0x3f70cc6f5aa1f3cf, 0x791132708a074012, 0x93e106a433bc58b3, + 0xbc8178039f2b002a, 0x62ec6c6dbd0bb25f, 0x88eb3c0760bf0a6a, 0xd813b3258515c07f, + 0x6e4cb6303be3c09b, 0x0df04680d8440525, 0x71971d5c682d085d, 0x2342001e465a4eee, + 0xf33b8bc628b5de82, 0x0a2a8d7e077d54e0, 0x9fb3bbc02e5f3c8c, 0x6d9a4957c00df280, + 0x6ef22b23d0a37f43, 0xe8b6368ba52f58ec, 0xce2df76800712e86, 0xd2c3ed1ad6748e82, + 0xe53a4fc747972100, 0x995a572451aeef66, 0xbe60a91a71900712, 0x9ef68d35359674f7, + 0x1dfa0a15647fbd15, 0x0c139b311baaa809, 0x8ec52396469aef86, 0xf2271e6086d42d06, + 0x0e766b114aba1470, 0x0b0bf8ca1c2cce0a, 0x475846a4aa0cd2d3, 0x2664fd8bf8448640, + 0xb2a3dfa6ac4fcdec, 0xed64812de81cf154, 0xdc1a160cc2c7385c, 0x02ee03b295ffd7dc, + 0x79afdf1c91de6176, 0x8604ae0f84ee89ac, 0x07ac6e46533e308d, 0x363bd6b35f0e0438, + 0x15f85253604d6e09, 0x135c83fdaeb3e622, 0x1bec24dd4263eb04, 0x5fabe67050c2cb16, + 0x4c36cd5b6667afe7, 0x35ec32791a6889b8, 0xe0a22e29de42c92a, 0x00bcf5f67f47463d, + 0x7c2b6ed9b82b6080, 0x8ae00689828d550b, 0x06ff88fddca2b4da, 0x17f276984ac726eb, + 0xf7317cf0529295e6, 0x58ca5f005e9f28eb, 0x61b6e40b40c14fff, 0xaa7c7ad596a1f19b, + 0xde8a97f8997157e1, 0xb5cfca28b0ed8196, 0x88f61445097e41e6, 0xded288f84ce8bfe5, + 0xd4520e9ee36841ad, 0x59f229bcd1a9427c, 0x0c592bd593f3319c, 0x6d390decb58ac6fe, + 0x38edfaf3e3809241, 0x37170390f22fd3e2, 0x72cbfcdb83c2df88, 0x6353c1ccd6b1825a, + 0x348331a54e4ff232, 0xc4584f5c6664d9c1, 0xc3977c19836b5a6e, 0xf48642e923d5e7e5, + 0xdafaea7c65065343, 0x531377e21495d20d, 0x73db4c04a29c82eb, 0xdd35bc8df370b37f, + 0x72675ce8ea6dd7dc, 0xb25b29f2e9415bce, 0x3ec2ff8396309b0f, 0x9a5ede41c776b648, + 0xe8c7a017c22c52c5, 
0xa563905dcecfcd43, 0xcf4bfaefd8311ee7, 0x45cddf4e24e6fe8f, + 0x6bde1ac7d0c6d7c9, 0xacfa310345aa5d4a, 0xae1175c2cf067065, 0xa51b613582f89c77, + 0xf7506984a348c84e, 0xdfd94fb2d07737ea, 0xe0863e5636069dd2, 0x80f2886bc4cedd32, + 0xd7e661ae9a97be47, 0x67969a6af45cd585, 0x01afc14f93f36e2b, 0x021ac5ef195bc82d, + 0xd2fd8e3ce622aaca, 0xe8310afa23912612, 0xc4eb27b2a1fd0859, 0x75451b04a2edbd17, + 0x632098732cefbfdd, 0x8e1450f7f36d84e2, 0xf29971cf9664532d, 0xcbbe4ee1cfeff4b3, + 0xaf6939d96737eead, 0x3a83de21f52d28d3, 0x650ef6c5fb3bb2c8, 0xe0cdcf864a9d4498, + 0x36e108faaef471c1, 0x453c16790d08bb68, 0x09c20a6c3b6c03be, 0xdefba41c4de20a7c, + 0x0a1a6a8877792405, 0x613eee91b95a9eb0, 0x286d109b11f2bc8f, 0xddaf51147956e76a, + 0x9fd51b88032a8058, 0x1f1dd1241b93589c, 0x4860285dcc66546f, 0xbedc6ba1005bb964, + 0x6e221adc28198362, 0xeca08ffe3f2e57b1, 0x0e0a10735f54bb14, 0x3ae3087599e44d01, + 0x37f194212591f073, 0x0cd1526a0871bd30, 0xe9bbf6481fb48d12, 0xb1630f043888ed03, + 0x0fa0277f1e22167e, 0xff47317bd272cadf, 0xeb2fb89a8653215c, 0xd6c3a807f6e6d69e, + 0x1c47ed30b9f5bf62, 0x9a7781e09dde6145, 0x271cfb40ed49c4d3, 0x63d097e91ad321bc, + 0xea1cb6e7f206e4b8, 0x1d31175f60165873, 0x9f737f83159dff70, 0x94eb256ee1a468d0, + 0xd4619bbcda11f51b, 0x1318464993189d18, 0x794dd0f2ac4f4691, 0x4be7fd9087d07ae0, + 0xb6d42cd90db9f589, 0x7d5c1d64c51a240c, 0x8b9be230ab819fbb, 0x80ba859a5cc19671, + 0xd4617a4c46183f0a, 0x6eeed1c993e7c448, 0xb9f93bd067fe6e36, 0x22c3799fca110698, + 0xb8b67c1c8a2acc83, 0xd8ecc578eb75a090, 0xc5ca433a3e18bd99, 0xb3a6da94480e7e4d, + 0x5e5dcd9560bced33, 0xcaf933fe0bf69a3f, 0x7b589372dbf59471, 0x50bfaadea00bae3d, + 0x0afb7f3b4b7df256, 0x2e7d11a77959fe2a, 0x0f97c69068e3d179, 0x7d14748fa160e585, + 0x3e254fe4cac39a0b, 0x32d8041cf4229b7a, 0x141e8512007ca0f9, 0x889774e10e0126b2, + 0xe5e25bd082fe946e, 0x6cc8a0ff013b3856, 0xacf1231667c69966, 0x5aa1f3cffa0f2bd0, + 0x7b454cb35d4c91fc, 0x8a07401279d95f64, 0x311709b8807121c0, 0x33bc58b3b0a3f16d, + 0x9948a7d2618e0996, 0x9f2b002a2308bca9, 
0x809cef1f343272b3, 0xbd0bb25ff5c40599, + 0x11dde5b740c1c64c, 0x60bf0a6a71cc89e8, +}; diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31..f674d3f71d7 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -18,10 +18,85 @@ #include "port/pg_crc32c.h" -pg_attribute_no_sanitize_alignment() +/* min size to compute multiple segments in parallel */ +#define MIN_PARALLEL_LENGTH 600 + +static pg_crc32c pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len); + + pg_attribute_target("sse4.2") pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + pg_crc32c crc0 = crc; + + /* XXX not for commit */ + const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len; + +#if SIZEOF_VOID_P >= 8 + if (unlikely(len >= MIN_PARALLEL_LENGTH)) + { + /* + * Align pointer to avoid straddling cacheline boundaries, since we + * issue three loads per loop iteration below. + */ + for (; (uintptr_t) p & 7; len--) + crc0 = _mm_crc32_u8(crc0, *p++); + + /* + * A CRC instruction can be issued every cycle but the latency of its + * result will take several cycles. We can take advantage of this by + * dividing the input into 3 equal blocks and computing the CRC of + * each independently. + */ + while (len >= MIN_PARALLEL_LENGTH) + { + const size_t block_len = Min(CRC_MAX_BLOCK_LEN, + len / CRC_BYTES_PER_ITER); + const uint64 *in64 = (const uint64 *) (p); + pg_crc32c crc1 = 0, + crc2 = 0; + uint64 mul0, + mul1, + precompute; + + for (int i = 0; i < block_len; i++, in64++) + { + crc0 = _mm_crc32_u64(crc0, *(in64)); + crc1 = _mm_crc32_u64(crc1, *(in64 + block_len)); + crc2 = _mm_crc32_u64(crc2, *(in64 + block_len * 2)); + } + + /* + * Combine the partial CRCs using carryless multiplication on + * pre-computed length-specific constants. 
+ */ + precompute = combine_crc_lookup[block_len - 1]; + mul0 = pg_clmul(crc0, (uint32) precompute); + mul1 = pg_clmul(crc1, (uint32) (precompute >> 32)); + crc0 = _mm_crc32_u64(0, mul0); + crc0 ^= _mm_crc32_u64(0, mul1); + crc0 ^= crc2; + + p += block_len * CRC_BYTES_PER_ITER; + len -= block_len * CRC_BYTES_PER_ITER; + } + } +#endif + + crc0 = pg_comp_crc32c_sse42_tail(crc0, p, len); + + /* XXX not for commit */ + Assert(crc0 == pg_comp_crc32c_sb8(crc, data, orig_len)); + + return crc0; +} + +pg_attribute_no_sanitize_alignment() +pg_attribute_target("sse4.2") +static pg_crc32c +pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len) { const unsigned char *p = data; const unsigned char *pend = p + len; -- 2.48.1