From 3b14aa4dcd03d0f86aec3ba78b2750b650f9cb67 Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 6 Jun 2021 11:23:38 -0400 Subject: [PATCH v11 2/2] Use SSE instructions for pg_utf8_verifystr() where available On x86-64, we use SSE 4.1 with a lookup algorithm based on "Validating UTF-8 In Less Than One Instruction Per Byte" by John Keiser and Daniel Lemire. Since configure already tests for SSE 4.2 for CRC, we piggy-back on top of that. The lookup tables are taken from the simdjson library (Apache 2.0 licensed), but the code is written from scratch using simdjson as a reference. --- config/c-compiler.m4 | 28 +- configure | 114 +++-- configure.ac | 61 ++- src/Makefile.global.in | 3 + src/common/wchar.c | 155 +------ src/include/pg_config.h.in | 9 + src/include/port/pg_utf8.h | 95 +++++ src/port/Makefile | 6 + src/port/pg_utf8_fallback.c | 129 ++++++ src/port/pg_utf8_sse42.c | 508 +++++++++++++++++++++++ src/port/pg_utf8_sse42_choose.c | 68 +++ src/test/regress/expected/conversion.out | 34 ++ src/test/regress/sql/conversion.sql | 31 ++ src/tools/msvc/Mkvcbuild.pm | 4 + src/tools/msvc/Solution.pm | 3 + 15 files changed, 1053 insertions(+), 195 deletions(-) create mode 100644 src/include/port/pg_utf8.h create mode 100644 src/port/pg_utf8_fallback.c create mode 100644 src/port/pg_utf8_sse42.c create mode 100644 src/port/pg_utf8_sse42_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 780e906ecc..b1604eac58 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -591,36 +591,46 @@ if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then AC_DEFINE(HAVE_GCC__ATOMIC_INT64_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int64 *, int64 *, int64).]) fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS -# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_SSE42_INTRINSICS # --------------------------- # Check if the compiler supports the x86 CRC instructions added in SSE 4.2, # using the _mm_crc32_u8 and _mm_crc32_u32 intrinsic functions. 
(We don't # test the 8-byte variant, _mm_crc32_u64, but it is assumed to be present if # the other ones are, on x86-64 platforms) # +# While at it, check for support of x86 instructions added in SSSE3 and SSE4.1, +# in particular _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128. +# We should be able to assume these are understood by the compiler if CRC +# intrinsics are, but it's better to document our reliance on them here. +# +# We don't test for SSE2 intrinsics, as they are assumed to be present on +# x86-64 platforms, which we can easily check at compile time. +# # An optional compiler flag can be passed as argument (e.g. -msse4.2). If the -# intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_SSE42. -AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS], -[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl -AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar], +# intrinsics are supported, sets pgac_sse42_intrinsics, and CFLAGS_SSE42. 
+AC_DEFUN([PGAC_SSE42_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=$1], [Ac_cachevar], [pgac_save_CFLAGS=$CFLAGS CFLAGS="$pgac_save_CFLAGS $1" AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>], [unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + __m128i vec = _mm_set1_epi8(crc); + vec = _mm_shuffle_epi8(vec, + _mm_alignr_epi8(vec, vec, 1)); /* return computed value, to prevent the above being optimized away */ - return crc == 0;])], + return _mm_testz_si128(vec, vec);])], [Ac_cachevar=yes], [Ac_cachevar=no]) CFLAGS="$pgac_save_CFLAGS"]) if test x"$Ac_cachevar" = x"yes"; then CFLAGS_SSE42="$1" - pgac_sse42_crc32_intrinsics=yes + pgac_sse42_intrinsics=yes fi undefine([Ac_cachevar])dnl -])# PGAC_SSE42_CRC32_INTRINSICS - +])# PGAC_SSE42_INTRINSICS # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index e9b98f442f..1663e2d466 100755 --- a/configure +++ b/configure @@ -645,6 +645,7 @@ XGETTEXT MSGMERGE MSGFMT_FLAGS MSGFMT +PG_UTF8_OBJS PG_CRC32C_OBJS CFLAGS_ARMV8_CRC32C CFLAGS_SSE42 @@ -17963,14 +17964,14 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 intrinsics. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used +# First check if these intrinsics can be used # with the default compiler flags. If not, check if adding the -msse4.2 # flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required. -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... 
" >&6; } -if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then : +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=" >&5 +$as_echo_n "checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=... " >&6; } +if ${pgac_cv_sse42_intrinsics_+:} false; then : $as_echo_n "(cached) " >&6 else pgac_save_CFLAGS=$CFLAGS @@ -17984,32 +17985,35 @@ main () unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + __m128i vec = _mm_set1_epi8(crc); + vec = _mm_shuffle_epi8(vec, + _mm_alignr_epi8(vec, vec, 1)); /* return computed value, to prevent the above being optimized away */ - return crc == 0; + return _mm_testz_si128(vec, vec); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_sse42_crc32_intrinsics_=yes + pgac_cv_sse42_intrinsics_=yes else - pgac_cv_sse42_crc32_intrinsics_=no + pgac_cv_sse42_intrinsics_=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext CFLAGS="$pgac_save_CFLAGS" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5 -$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; } -if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_intrinsics_" >&5 +$as_echo "$pgac_cv_sse42_intrinsics_" >&6; } +if test x"$pgac_cv_sse42_intrinsics_" = x"yes"; then CFLAGS_SSE42="" - pgac_sse42_crc32_intrinsics=yes + pgac_sse42_intrinsics=yes fi -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5 -$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... 
" >&6; } -if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then : +if test x"$pgac_sse42_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2" >&5 +$as_echo_n "checking for _mm_crc32_u8, _mm_crc32_u32, _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2... " >&6; } +if ${pgac_cv_sse42_intrinsics__msse4_2+:} false; then : $as_echo_n "(cached) " >&6 else pgac_save_CFLAGS=$CFLAGS @@ -18023,26 +18027,29 @@ main () unsigned int crc = 0; crc = _mm_crc32_u8(crc, 0); crc = _mm_crc32_u32(crc, 0); + __m128i vec = _mm_set1_epi8(crc); + vec = _mm_shuffle_epi8(vec, + _mm_alignr_epi8(vec, vec, 1)); /* return computed value, to prevent the above being optimized away */ - return crc == 0; + return _mm_testz_si128(vec, vec); ; return 0; } _ACEOF if ac_fn_c_try_link "$LINENO"; then : - pgac_cv_sse42_crc32_intrinsics__msse4_2=yes + pgac_cv_sse42_intrinsics__msse4_2=yes else - pgac_cv_sse42_crc32_intrinsics__msse4_2=no + pgac_cv_sse42_intrinsics__msse4_2=no fi rm -f core conftest.err conftest.$ac_objext \ conftest$ac_exeext conftest.$ac_ext CFLAGS="$pgac_save_CFLAGS" fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5 -$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; } -if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_intrinsics__msse4_2" >&5 +$as_echo "$pgac_cv_sse42_intrinsics__msse4_2" >&6; } +if test x"$pgac_cv_sse42_intrinsics__msse4_2" = x"yes"; then CFLAGS_SSE42="-msse4.2" - pgac_sse42_crc32_intrinsics=yes + pgac_sse42_intrinsics=yes fi fi @@ -18177,12 +18184,12 @@ fi # in the template or configure command line. 
if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 else # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else # Use ARM CRC Extension if available. @@ -18196,7 +18203,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && # fall back to slicing-by-8 algorithm, which doesn't require any # special CPU support. USE_SLICING_BY_8_CRC32C=1 - fi + fi fi fi fi @@ -18249,6 +18256,61 @@ $as_echo "slicing-by-8" >&6; } fi +# Select UTF-8 validator implementation. +# +# If we are targeting a processor that has SSE 4.2 instructions, we can use +# those to validate UTF-8 characters. If we're not targeting such +# a processor, but we can nevertheless produce code that uses the SSE +# intrinsics, perhaps with some extra CFLAGS, compile both implementations and +# select which one to use at runtime, depending on whether SSE 4.2 is supported +# by the processor we're running on. +# +# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1 +# in the template or configure command line. 
+if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then + if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + USE_SSE42_UTF8=1 + else + # the CPUID instruction is needed for the runtime check. + if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1 + else + # fall back to algorithm which doesn't require any special + # CPU support. + USE_FALLBACK_UTF8=1 + fi + fi +fi + +# Set PG_UTF8_OBJS appropriately depending on the selected implementation. +# Note: We need the fallback for error handling in all builds. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to use" >&5 +$as_echo_n "checking which UTF-8 validator to use... " >&6; } +if test x"$USE_SSE42_UTF8" = x"1"; then + +$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 +$as_echo "SSE 4.2" >&6; } +else + if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then + +$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 +$as_echo "SSE 4.2 with runtime check" >&6; } + else + +$as_echo "#define USE_FALLBACK_UTF8 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_fallback.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: fallback" >&5 +$as_echo "fallback" >&6; } + fi +fi + # Select semaphore implementation type. 
if test "$PORTNAME" != "win32"; then diff --git a/configure.ac b/configure.ac index 3b42d8bdc9..fff229e570 100644 --- a/configure.ac +++ b/configure.ac @@ -2059,14 +2059,14 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi -# Check for Intel SSE 4.2 intrinsics to do CRC calculations. +# Check for Intel SSE 4.2 intrinsics. # -# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used +# First check if these intrinsics can be used # with the default compiler flags. If not, check if adding the -msse4.2 # flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required. -PGAC_SSE42_CRC32_INTRINSICS([]) -if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then - PGAC_SSE42_CRC32_INTRINSICS([-msse4.2]) +PGAC_SSE42_INTRINSICS([]) +if test x"$pgac_sse42_intrinsics" != x"yes"; then + PGAC_SSE42_INTRINSICS([-msse4.2]) fi AC_SUBST(CFLAGS_SSE42) @@ -2107,12 +2107,12 @@ AC_SUBST(CFLAGS_ARMV8_CRC32C) # in the template or configure command line. if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then # Use Intel SSE 4.2 if available. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 else # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for # the runtime check. - if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1 else # Use ARM CRC Extension if available. 
@@ -2126,7 +2126,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && # fall back to slicing-by-8 algorithm, which doesn't require any # special CPU support. USE_SLICING_BY_8_CRC32C=1 - fi + fi fi fi fi @@ -2163,6 +2163,51 @@ else fi AC_SUBST(PG_CRC32C_OBJS) +# Select UTF-8 validator implementation. +# +# If we are targeting a processor that has SSE 4.2 instructions, we can use +# those to validate UTF-8 characters. If we're not targeting such +# a processor, but we can nevertheless produce code that uses the SSE +# intrinsics, perhaps with some extra CFLAGS, compile both implementations and +# select which one to use at runtime, depending on whether SSE 4.2 is supported +# by the processor we're running on. +# +# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1 +# in the template or configure command line. +if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then + if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + USE_SSE42_UTF8=1 + else + # the CPUID instruction is needed for the runtime check. + if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1 + else + # fall back to algorithm which doesn't require any special + # CPU support. + USE_FALLBACK_UTF8=1 + fi + fi +fi + +# Set PG_UTF8_OBJS appropriately depending on the selected implementation. +# Note: We need the fallback for error handling in all builds. 
+AC_MSG_CHECKING([which UTF-8 validator to use]) +if test x"$USE_SSE42_UTF8" = x"1"; then + AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions.]) + PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o" + AC_MSG_RESULT(SSE 4.2) +else + if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then + AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions with a runtime check.]) + PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o" + AC_MSG_RESULT(SSE 4.2 with runtime check) + else + AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use the fallback UTF-8 validator written in C.]) + PG_UTF8_OBJS="pg_utf8_fallback.o" + AC_MSG_RESULT(fallback) + fi +fi +AC_SUBST(PG_UTF8_OBJS) # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8f05840821..f54433933b 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -721,6 +721,9 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ +# files needed for the chosen UTF-8 validation implementation +PG_UTF8_OBJS = @PG_UTF8_OBJS@ + LIBS := -lpgcommon -lpgport $(LIBS) # to make ws2_32.lib the last library diff --git a/src/common/wchar.c b/src/common/wchar.c index 2805d01f7f..37c4d4489b 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -13,58 +13,9 @@ #include "c.h" #include "mb/pg_wchar.h" +#include "port/pg_utf8.h" -/* for UTF-8 */ -#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80) -#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0) -#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0) -#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0) - -/* Verify a chunk of bytes for valid ASCII including a zero-byte check. 
*/ -static inline int -check_ascii(const unsigned char *s, int len) -{ - uint64 half1, - half2, - highbits_set, - x1, - x2, - x; - - if (len >= 2 * sizeof(uint64)) - { - memcpy(&half1, s, sizeof(uint64)); - memcpy(&half2, s + sizeof(uint64), sizeof(uint64)); - - /* Check if any bytes in this chunk have the high bit set. */ - highbits_set = (half1 | half2) & UINT64CONST(0x8080808080808080); - if (highbits_set) - return 0; - - /* - * Check if there are any zero bytes in this chunk. - * - * First, add 0x7f to each byte. This sets the high bit in each byte, - * unless it was a zero. We already checked that none of the bytes had - * the high bit set previously, so the max value each byte can have - * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to - * worry about carrying over to the next byte. - */ - x1 = half1 + UINT64CONST(0x7f7f7f7f7f7f7f7f); - x2 = half2 + UINT64CONST(0x7f7f7f7f7f7f7f7f); - - /* Then check that the high bit is set in each byte. */ - x = (x1 & x2) & UINT64CONST(0x8080808080808080); - if (x != UINT64CONST(0x8080808080808080)) - return 0; - - return 2 * sizeof(uint64); - } - else - return 0; -} - /* * Operations on multi-byte encodings are driven by a table of helper * functions. @@ -1810,108 +1761,8 @@ pg_utf8_verifychar(const unsigned char *s, int len) static int pg_utf8_verifystr(const unsigned char *s, int len) { - const unsigned char *start = s; - unsigned char b1, - b2, - b3, - b4; - - while (len > 0) - { - int l; - - /* fast path for ASCII-subset characters */ - l = check_ascii(s, len); - if (l) - { - s += l; - len -= l; - continue; - } - - /* Found non-ASCII or zero above, so verify a single character. 
*/ - if (!IS_HIGHBIT_SET(*s)) - { - if (*s == '\0') - break; - l = 1; - } - /* code points U+0080 through U+07FF */ - else if (IS_TWO_BYTE_LEAD(*s)) - { - l = 2; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - - if (!IS_CONTINUATION_BYTE(b2)) - break; - - /* check 2-byte overlong: 1100.000x.10xx.xxxx */ - if (b1 < 0xC2) - break; - } - /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ - else if (IS_THREE_BYTE_LEAD(*s)) - { - l = 3; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3)) - break; - - /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */ - if (b1 == 0xE0 && b2 < 0xA0) - break; - - /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ - if (b1 == 0xED && b2 > 0x9F) - break; - } - /* code points U+010000 through U+10FFFF */ - else if (IS_FOUR_BYTE_LEAD(*s)) - { - l = 4; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - b4 = *(s + 3); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3) || - !IS_CONTINUATION_BYTE(b4)) - break; - - /* - * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx - */ - if (b1 == 0xF0 && b2 < 0x90) - break; - - /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */ - if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) - break; - } - else - /* invalid byte */ - break; - - s += l; - len -= l; - } - - return s - start; + /* platform-specific implementation in src/port */ + return UTF8_VERIFYSTR(s, len); } /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 783b8fc1ba..1aa839d258 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -926,6 +926,15 @@ /* Define to 1 to build with PAM support. (--with-pam) */ #undef USE_PAM +/* Define to 1 to use the fallback UTF-8 validator written in C. */ +#undef USE_FALLBACK_UTF8 + +/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions. 
*/ +#undef USE_SSE42_UTF8 + +/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions with a runtime check. */ +#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK + /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */ #undef USE_SLICING_BY_8_CRC32C diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h new file mode 100644 index 0000000000..d3c2e757a4 --- /dev/null +++ b/src/include/port/pg_utf8.h @@ -0,0 +1,95 @@ +/*------------------------------------------------------------------------- + * + * pg_utf8.h + * Routines for fast validation of UTF-8 text. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/pg_utf8.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_UTF8_H +#define PG_UTF8_H + + +#if defined(USE_SSE42_UTF8) +/* Use Intel SSE 4.2 instructions. */ +#define UTF8_VERIFYSTR(s, len) \ + pg_validate_utf8_sse42((s), (len)) + +extern int pg_validate_utf8_sse42(const unsigned char *s, int len); + +#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK) +/* + * Use Intel SSE 4.2 instructions, but perform a runtime check first + * to check that they are available. + */ +#define UTF8_VERIFYSTR(s, len) \ + pg_validate_utf8((s), (len)) + +extern int (*pg_validate_utf8) (const unsigned char *s, int len); +extern int pg_validate_utf8_sse42(const unsigned char *s, int len); + +#else +#define UTF8_VERIFYSTR(s, len) \ + pg_validate_utf8_fallback((s), (len)) + +#endif /* USE_SSE42_UTF8 */ + +/* The following need to be visible everywhere. 
*/ + +extern int pg_validate_utf8_fallback(const unsigned char *s, int len); + +#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80) +#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0) +#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0) +#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0) + +/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */ +static inline int +check_ascii(const unsigned char *s, int len) +{ + uint64 half1, + half2, + highbits_set, + x1, + x2, + x; + + if (len >= 2 * sizeof(uint64)) + { + memcpy(&half1, s, sizeof(uint64)); + memcpy(&half2, s + sizeof(uint64), sizeof(uint64)); + + /* Check if any bytes in this chunk have the high bit set. */ + highbits_set = (half1 | half2) & UINT64CONST(0x8080808080808080); + if (highbits_set) + return 0; + + /* + * Check if there are any zero bytes in this chunk. + * + * First, add 0x7f to each byte. This sets the high bit in each byte, + * unless it was a zero. We already checked that none of the bytes had + * the high bit set previously, so the max value each byte can have + * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to + * worry about carrying over to the next byte. + */ + x1 = half1 + UINT64CONST(0x7f7f7f7f7f7f7f7f); + x2 = half2 + UINT64CONST(0x7f7f7f7f7f7f7f7f); + + /* Then check that the high bit is set in each byte. 
*/ + x = (x1 & x2) & UINT64CONST(0x8080808080808080); + if (x != UINT64CONST(0x8080808080808080)) + return 0; + + return 2 * sizeof(uint64); + } + else + return 0; +} + +#endif /* PG_UTF8_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 52dbf5783f..04838b0ab2 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -40,6 +40,7 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ + $(PG_UTF8_OBJS) \ bsearch_arg.o \ chklocale.o \ erand48.o \ @@ -89,6 +90,11 @@ libpgport.a: $(OBJS) thread.o: CFLAGS+=$(PTHREAD_CFLAGS) thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS) +# all versions of pg_utf8_sse42.o need CFLAGS_SSE42 +pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42) +pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42) +pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42) + # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42) pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42) diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c new file mode 100644 index 0000000000..1efedc2429 --- /dev/null +++ b/src/port/pg_utf8_fallback.c @@ -0,0 +1,129 @@ +/*------------------------------------------------------------------------- + * + * pg_utf8_fallback.c + * Validate UTF-8 using plain C. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_utf8_fallback.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#include "port/pg_utf8.h" + + +/* + * See the comment in common/wchar.c under "multibyte sequence validators". 
+ */ +int +pg_validate_utf8_fallback(const unsigned char *s, int len) +{ + const unsigned char *start = s; + unsigned char b1, + b2, + b3, + b4; + + while (len > 0) + { + int l; + + /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ + if (!IS_HIGHBIT_SET(*s)) + { + if (*s == '\0') + break; + l = 1; + } + /* code points U+0080 through U+07FF */ + else if (IS_TWO_BYTE_LEAD(*s)) + { + l = 2; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + + if (!IS_CONTINUATION_BYTE(b2)) + break; + + /* check 2-byte overlong: 1100.000x.10xx.xxxx */ + if (b1 < 0xC2) + break; + } + /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ + else if (IS_THREE_BYTE_LEAD(*s)) + { + l = 3; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3)) + break; + + /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */ + if (b1 == 0xE0 && b2 < 0xA0) + break; + + /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ + if (b1 == 0xED && b2 > 0x9F) + break; + } + /* code points U+010000 through U+10FFFF */ + else if (IS_FOUR_BYTE_LEAD(*s)) + { + l = 4; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + b4 = *(s + 3); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3) || + !IS_CONTINUATION_BYTE(b4)) + break; + + /* + * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx + */ + if (b1 == 0xF0 && b2 < 0x90) + break; + + /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */ + if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) + break; + } + else + /* invalid byte */ + break; + + s += l; + len -= l; + } + + return s - start; +} diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_sse42.c new file mode 100644 index 0000000000..cd050ec2bf --- /dev/null +++ b/src/port/pg_utf8_sse42.c @@ -0,0 +1,508 @@ 
+/*------------------------------------------------------------------------- + * + * pg_utf8_sse42.c + * Validate UTF-8 using Intel SSE 4.2 instructions. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_utf8_sse42.c + * + *------------------------------------------------------------------------- + */ + +#include "c.h" + +#include <nmmintrin.h> + +#include "port/pg_utf8.h" + +/* + * This module is based on the paper "Validating UTF-8 In Less Than One + * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090 + * [cs.DB], 10 Oct 2020. + * + * The authors provide an implementation of this algorithm + * in the simdjson library (Apache 2.0 license) found at + * https://github.com/simdjson/simdjson. Even if it were practical to + * use this library directly, we cannot because it simply returns valid + * or not valid, and we need to return the number of valid bytes found + * before the first invalid one. + * + * Therefore, the PG code was written from scratch, but with some idioms + * and naming conventions adapted from the Westmere implementation of + * simdjson. The constants and lookup tables were taken directly from + * simdjson with some cosmetic rearrangements. + * + * The core of the lookup algorithm is a two-part process: + * + * 1. Classify 2-byte sequences. All 2-byte errors can be found by looking + * at the first three nibbles of each overlapping 2-byte sequence, + * using three separate lookup tables. The interesting bytes are either + * definite errors or two continuation bytes in a row. The latter may + * be valid depending on what came before. + * + * 2. Find starts of possible 3- and 4-byte sequences. + * + * Combining the above results allows us to verify any UTF-8 sequence. 
+ */ + + +/* constants for comparing bytes */ +#define MAX_CONTINUATION 0xBF +#define MAX_TWO_BYTE_LEAD 0xDF +#define MAX_THREE_BYTE_LEAD 0xEF + +/* lookup tables for classifying two-byte sequences */ + +/* + * 11______ 0_______ + * 11______ 11______ + */ +#define TOO_SHORT (1 << 0) + +/* 0_______ 10______ */ +#define TOO_LONG (1 << 1) + +/* 1100000_ 10______ */ +#define OVERLONG_2 (1 << 2) + +/* 11100000 100_____ */ +#define OVERLONG_3 (1 << 3) + +/* The following two symbols intentionally share the same value. */ + +/* 11110000 1000____ */ +#define OVERLONG_4 (1 << 4) + +/* + * 11110101 1000____ + * 1111011_ 1000____ + * 11111___ 1000____ + */ +#define TOO_LARGE_1000 (1 << 4) + +/* + * 11110100 1001____ + * 11110100 101_____ + * 11110101 1001____ + * 11110101 101_____ + * 1111011_ 1001____ + * 1111011_ 101_____ + * 11111___ 1001____ + * 11111___ 101_____ + */ +#define TOO_LARGE (1 << 5) + +/* 11101101 101_____ */ +#define SURROGATE (1 << 6) + +/* + * 10______ 10______ + * + * The cast here is to silence warnings about implicit conversion + * from 'int' to 'char'. It's fine that this is a negative value, + * because we only care about the pattern of bits. 
+ */ +#define TWO_CONTS ((char) (1 << 7)) + +/* These all have ____ in byte 1 */ +#define CARRY (TOO_SHORT | TOO_LONG | TWO_CONTS) + +/* + * table for categorizing bits in the high nibble of + * the first byte of a 2-byte sequence + */ +#define BYTE_1_HIGH_TABLE \ + /* 0_______ ________ */ \ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, \ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, \ + /* 10______ ________ */ \ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, \ + /* 1100____ ________ */ \ + TOO_SHORT | OVERLONG_2, \ + /* 1101____ ________ */ \ + TOO_SHORT, \ + /* 1110____ ________ */ \ + TOO_SHORT | OVERLONG_3 | SURROGATE, \ + /* 1111____ ________ */ \ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4 + +/* + * table for categorizing bits in the low nibble of + * the first byte of a 2-byte sequence + */ +#define BYTE_1_LOW_TABLE \ + /* ____0000 ________ */ \ + CARRY | OVERLONG_2 | OVERLONG_3 | OVERLONG_4, \ + /* ____0001 ________ */ \ + CARRY | OVERLONG_2, \ + /* ____001_ ________ */ \ + CARRY, \ + CARRY, \ + /* ____0100 ________ */ \ + CARRY | TOO_LARGE, \ + /* ____0101 ________ */ \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + /* ____011_ ________ */ \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + /* ____1___ ________ */ \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + /* ____1101 ________ */ \ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, \ + CARRY | TOO_LARGE | TOO_LARGE_1000, \ + CARRY | TOO_LARGE | TOO_LARGE_1000 + +/* + * table for categorizing bits in the high nibble of + * the second byte of a 2-byte sequence + */ +#define BYTE_2_HIGH_TABLE \ + /* ________ 0_______ */ \ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, \ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, \ + /* ________ 1000____ */ \ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4, \ 
+	/* ________ 1001____ */ \
+	TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, \
+	/* ________ 101_____ */ \
+	TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, \
+	TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, \
+	/* ________ 11______ */ \
+	TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+
+
+/* helper functions to wrap intrinsics */
+
+#define vset(...)		_mm_setr_epi8(__VA_ARGS__)
+
+/* return a zeroed register */
+static inline __m128i
+vzero(void)
+{
+	return _mm_setzero_si128();
+}
+
+/* perform an unaligned load of 16 bytes from memory into a register */
+static inline __m128i
+vload(const unsigned char *raw_input)
+{
+	return _mm_loadu_si128((const __m128i *) raw_input);
+}
+
+/* return a vector with each 8-bit lane populated with the input scalar */
+static inline __m128i
+splat(char byte)
+{
+	return _mm_set1_epi8(byte);
+}
+
+/* perform signed greater-than on all 8-bit lanes */
+static inline __m128i
+greater_than(const __m128i v1, const __m128i v2)
+{
+	return _mm_cmpgt_epi8(v1, v2);
+}
+
+/* bitwise vector operations */
+static inline __m128i
+bitwise_and(const __m128i v1, const __m128i v2)
+{
+	return _mm_and_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_or(const __m128i v1, const __m128i v2)
+{
+	return _mm_or_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_xor(const __m128i v1, const __m128i v2)
+{
+	return _mm_xor_si128(v1, v2);
+}
+
+/*
+ * Do unsigned subtraction, but instead of wrapping around
+ * on overflow, stop at zero. Useful for emulating unsigned
+ * comparison.
+ */
+static inline __m128i
+saturating_sub(const __m128i v1, const __m128i v2)
+{
+	return _mm_subs_epu8(v1, v2);
+}
+
+/*
+ * Shift right each 8-bit lane
+ *
+ * There is no intrinsic to do this on 8-bit lanes, so shift right in each
+ * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
+ */
+static inline __m128i
+shift_right(const __m128i v, const int n)
+{
+	const __m128i shift16 = _mm_srli_epi16(v, n);
+	const __m128i mask = splat(0xFF >> n);
+
+	return bitwise_and(shift16, mask);
+}
+
+/*
+ * Shift entire 'input' register right by N 8-bit lanes, and
+ * replace the first N lanes with the last N lanes from the
+ * 'prev' register. Could be stated in C thusly:
+ *
+ * ((prev << 128) | input) >> (N * 8)
+ *
+ * The third argument to the intrinsic must be a numeric constant, so
+ * we must have separate functions for different shift amounts.
+ */
+static inline __m128i
+prev1(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+}
+
+static inline __m128i
+prev2(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+}
+
+static inline __m128i
+prev3(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+}
+
+/*
+ * For each 8-bit lane in the input, use that value as an index
+ * into the 'table' vector as if it were a 16-element byte array.
+ */
+static inline __m128i
+lookup(const __m128i input, const __m128i table)
+{
+	return _mm_shuffle_epi8(table, input);
+}
+
+/*
+ * Return a vector with lanes non-zero where we have either errors, or
+ * two or more continuations in a row.
+ */
+static inline __m128i
+check_special_cases(const __m128i prev, const __m128i input)
+{
+	const __m128i byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
+	const __m128i byte_1_low_table = vset(BYTE_1_LOW_TABLE);
+	const __m128i byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
+
+	/*
+	 * To classify the first byte in each chunk we need to have the last byte
+	 * from the previous chunk.
+	 */
+	const __m128i input_shift1 = prev1(prev, input);
+
+	/* put the relevant nibbles into their own bytes in their own registers */
+	const __m128i byte_1_high = shift_right(input_shift1, 4);
+	const __m128i byte_1_low = bitwise_and(input_shift1, splat(0x0F));
+	const __m128i byte_2_high = shift_right(input, 4);
+
+	/* look up the candidate error bits for each set of nibbles */
+	const __m128i lookup_1_high = lookup(byte_1_high, byte_1_high_table);
+	const __m128i lookup_1_low = lookup(byte_1_low, byte_1_low_table);
+	const __m128i lookup_2_high = lookup(byte_2_high, byte_2_high_table);
+
+	/*
+	 * AND all the lookups together. At this point, non-zero lanes in the
+	 * returned vector represent:
+	 *
+	 * 1. invalid 2-byte sequences
+	 *
+	 * 2. the second continuation byte of a 3- or 4-byte character
+	 *
+	 * 3. the third continuation byte of a 4-byte character
+	 */
+	const __m128i temp = bitwise_and(lookup_1_high, lookup_1_low);
+
+	return bitwise_and(temp, lookup_2_high);
+}
+
+/*
+ * Return a vector with lanes set to TWO_CONTS where we expect to find two
+ * continuations in a row. These are valid only within 3- and 4-byte sequences.
+ */
+static inline __m128i
+check_multibyte_lengths(const __m128i prev, const __m128i input)
+{
+	/*
+	 * Populate registers that contain the input shifted right by 2 and 3
+	 * bytes, filling in the left lanes from the previous input.
+	 */
+	const __m128i input_shift2 = prev2(prev, input);
+	const __m128i input_shift3 = prev3(prev, input);
+
+	/*
+	 * Constants for comparison. Any 3-byte lead is greater than
+	 * MAX_TWO_BYTE_LEAD, etc.
+	 */
+	const __m128i max_lead2 = splat(MAX_TWO_BYTE_LEAD);
+	const __m128i max_lead3 = splat(MAX_THREE_BYTE_LEAD);
+
+	/*
+	 * Look in the shifted registers for 3- or 4-byte leads. There is no
+	 * unsigned comparison, so we use saturating subtraction followed by
+	 * signed comparison with zero. Any non-zero bytes in the result represent
+	 * valid leads.
+	 */
+	const __m128i is_third_byte = saturating_sub(input_shift2, max_lead2);
+	const __m128i is_fourth_byte = saturating_sub(input_shift3, max_lead3);
+
+	/* OR them together for easier comparison */
+	const __m128i temp = bitwise_or(is_third_byte, is_fourth_byte);
+
+	/*
+	 * Set all bits in each 8-bit lane if the result is greater than zero.
+	 * Signed comparison is okay because the values are small.
+	 */
+	const __m128i must23 = greater_than(temp, vzero());
+
+	/*
+	 * We want to compare with the result of check_special_cases() so apply a
+	 * mask to return only the set bits corresponding to the "two
+	 * continuations" case.
+	 */
+	return bitwise_and(must23, splat(TWO_CONTS));
+}
+
+/* set bits in *error where 'input', preceded by 'prev', is invalid UTF-8 */
+static inline void
+check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+{
+	const __m128i special_cases = check_special_cases(prev, input);
+	const __m128i expect_two_conts = check_multibyte_lengths(prev, input);
+
+	/* Expected and found TWO_CONTS bits cancel; what remains is an error. */
+	const __m128i result = bitwise_xor(expect_two_conts, special_cases);
+
+	*error = bitwise_or(*error, result);
+}
+
+/* return false if a register is zero, true otherwise */
+static inline bool
+to_bool(const __m128i v)
+{
+	/*
+	 * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
+	 * zero. Zero is the only value whose bitwise AND with itself is zero.
+	 */
+	return !_mm_testz_si128(v, v);
+}
+
+/* set bits in the error vector where bytes in the input are zero */
+static inline void
+check_for_zeros(const __m128i v, __m128i * error)
+{
+	const __m128i cmp = _mm_cmpeq_epi8(v, vzero());
+
+	*error = bitwise_or(*error, cmp);
+}
+
+/* vector version of IS_HIGHBIT_SET() */
+static inline bool
+is_highbit_set(const __m128i v)
+{
+	return _mm_movemask_epi8(v) != 0;
+}
+
+/* return non-zero if the input terminates with an incomplete code point */
+static inline __m128i
+is_incomplete(const __m128i v)
+{
+	const __m128i max_array =
+	vset(0xFF, 0xFF, 0xFF, 0xFF,
+		 0xFF, 0xFF, 0xFF, 0xFF,
+		 0xFF, 0xFF, 0xFF, 0xFF,
+		 0xFF, MAX_THREE_BYTE_LEAD, MAX_TWO_BYTE_LEAD, MAX_CONTINUATION);
+
+	return saturating_sub(v, max_array);
+}
+
+/*
+ * See the comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_sse42(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	const int	orig_len = len;
+	__m128i		error = vzero();
+	__m128i		prev = vzero();
+	__m128i		prev_incomplete = vzero();
+	__m128i		input;
+
+	/*
+	 * NB: This check must be strictly greater-than, otherwise an invalid byte
+	 * at the end might not get detected. The cast keeps the comparison signed.
+	 */
+	while (len > (int) sizeof(__m128i))
+	{
+		input = vload(s);
+
+		check_for_zeros(input, &error);
+
+		/*
+		 * If the chunk is all ASCII, we can skip the full UTF-8 check, but we
+		 * must still check the previous chunk for incomplete multibyte
+		 * sequences at the end. We only update prev_incomplete if the chunk
+		 * contains non-ASCII, since the error is cumulative.
+		 */
+		if (!is_highbit_set(input))
+			error = bitwise_or(error, prev_incomplete);
+		else
+		{
+			check_utf8_bytes(prev, input, &error);
+			prev_incomplete = is_incomplete(input);
+		}
+
+		prev = input;
+		s += sizeof(__m128i);
+		len -= sizeof(__m128i);
+	}
+
+	/*
+	 * If we saw an error any time during the loop, start over with the
+	 * fallback so we can return the number of valid bytes.
+	 */
+	if (to_bool(error))
+		return pg_validate_utf8_fallback(start, orig_len);
+	else
+	{
+		/*
+		 * For short sequences, just use the fallback. For the last few bytes
+		 * of a longer sequence, we walk backwards into the previous chunk,
+		 * find the last byte that could have been the start of a valid
+		 * character, and start the fallback from there.
+		 */
+		while (s > start)
+		{
+			s--;
+			len++;
+
+			if ((!IS_HIGHBIT_SET(*s) && *s != '\0') ||
+				IS_TWO_BYTE_LEAD(*s) ||
+				IS_THREE_BYTE_LEAD(*s) ||
+				IS_FOUR_BYTE_LEAD(*s))
+				break;
+		}
+		return orig_len - len + pg_validate_utf8_fallback(s, len);
+	}
+}
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_sse42_choose.c
new file mode 100644
index 0000000000..ff6120be2b
--- /dev/null
+++ b/src/port/pg_utf8_sse42_choose.c
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42_choose.c
+ *	  Choose between Intel SSE 4.2 and fallback implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel SSE
+ * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
+ * fall back to the pure C implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_sse42_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_utf8.h"
+
+static bool
+pg_utf8_sse42_available(void)
+{
+	/* To save from checking every SSE2 intrinsic, insist on 64-bit.
*/
+#if defined(__x86_64__) || defined(_M_AMD64)	/* _M_AMD64: MSVC x64 */
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif							/* HAVE__GET_CPUID */
+	return (exx[2] & (1 << 20)) != 0;	/* ECX bit 20: SSE 4.2 */
+
+#else
+	return false;
+#endif							/* __x86_64__ || _M_AMD64 */
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static int
+pg_validate_utf8_choose(const unsigned char *s, int len)
+{
+	if (pg_utf8_sse42_available())
+		pg_validate_utf8 = pg_validate_utf8_sse42;
+	else
+		pg_validate_utf8 = pg_validate_utf8_fallback;
+
+	return pg_validate_utf8(s, len);
+}
+
+int			(*pg_validate_utf8) (const unsigned char *s, int len) = pg_validate_utf8_choose;
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 9315ad3abd..8d14c96c10 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -196,6 +196,40 @@ order by description;
 -------------+------------+---------------------
 (0 rows)
 
+-- Test SSE ASCII fast path with cases where incomplete UTF-8 sequences
+-- fall at the end of a 16-byte boundary followed by more than 16 bytes
+-- of ASCII.
+with test_bytes as ( + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs + union all + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_inputs +), test_padded as ( + select + description, + (test_conv(repeat('.', 16 - length(inbytes))::bytea || inbytes || repeat('.', 17)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + description | orig_error | error_after_padding +-------------+------------+--------------------- +(0 rows) + -- Test conversions from UTF-8 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; description | inbytes | result | errorat | error diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 8ad5290f4c..1255b047b6 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -156,6 +156,37 @@ using (description) where p.error is distinct from b.error order by description; +-- Test SSE ASCII fast path with cases where incomplete UTF-8 sequences +-- fall at the end of a 16-byte boundary followed by more than 16 bytes +-- of ASCII. 
+with test_bytes as ( + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs + union all + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_inputs +), test_padded as ( + select + description, + (test_conv(repeat('.', 16 - length(inbytes))::bytea || inbytes || repeat('.', 17)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + -- Test conversions from UTF-8 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_inputs; diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index 233ddbf4c2..9b8bad9044 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -120,10 +120,14 @@ sub mkvcbuild push(@pgportfiles, 'pg_crc32c_sse42_choose.c'); push(@pgportfiles, 'pg_crc32c_sse42.c'); push(@pgportfiles, 'pg_crc32c_sb8.c'); + push(@pgportfiles, 'pg_utf8_sse42_choose.c'); + push(@pgportfiles, 'pg_utf8_sse42.c'); + push(@pgportfiles, 'pg_utf8_fallback.c'); } else { push(@pgportfiles, 'pg_crc32c_sb8.c'); + push(@pgportfiles, 'pg_utf8_fallback.c'); } our @pgcommonallfiles = qw( diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 3c5fe5dddc..09f2bcf0a7 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -499,6 +499,9 @@ sub GenerateFiles USE_NAMED_POSIX_SEMAPHORES => undef, USE_OPENSSL => undef, USE_PAM => undef, + USE_FALLBACK_UTF8 => undef, + USE_SSE42_UTF8 => undef, + USE_SSE42_UTF8_WITH_RUNTIME_CHECK => 1, USE_SLICING_BY_8_CRC32C => undef, USE_SSE42_CRC32C => undef, USE_SSE42_CRC32C_WITH_RUNTIME_CHECK => 1, -- 2.31.1