config/c-compiler.m4                     |  34 +++
configure                                | 147 ++++++++++-
configure.ac                             |  62 ++++-
src/Makefile.global.in                   |   3 +
src/common/wchar.c                       | 116 +++++++--
src/include/pg_config.h.in               |   9 +
src/include/port/pg_utf8.h               |  72 ++++++
src/port/Makefile                        |   6 +
src/port/pg_utf8_fallback.c              | 132 ++++++++++
src/port/pg_utf8_sse42.c                 | 424 +++++++++++++++++++++++++++++++
src/port/pg_utf8_sse42_choose.c          |  69 +++++
src/test/regress/expected/conversion.out |  52 ++++
src/test/regress/sql/conversion.sql      |  28 ++
src/tools/msvc/Solution.pm               |   3 +
14 files changed, 1131 insertions(+), 26 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 780e906ecc..a346d8429a 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -591,6 +591,40 @@ if test x"$pgac_cv_gcc_atomic_int64_cas" = x"yes"; then
   AC_DEFINE(HAVE_GCC__ATOMIC_INT64_CAS, 1, [Define to 1 if you have __atomic_compare_exchange_n(int64 *, int64 *, int64).])
 fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
 
+# PGAC_SSE42_UTF8_INTRINSICS
+# ---------------------------
+# XXX this was copy-pasted from the equivalent CRC checks -- there may be bugs.
+#
+# Check if the compiler supports x86 instructions added in SSSE3 and SSE 4.1,
+# in particular _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128.
+# We don't test for SSE2 intrinsics, but they are assumed to be present on
+# x86-64 platforms.
+#
+# An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
+# intrinsics are supported, sets pgac_sse42_utf8_intrinsics and CFLAGS_SSE42.
+#
+# Note: We could create a new CFLAGS macro for SSE4.1, but it doesn't seem worth it.
+AC_DEFUN([PGAC_SSE42_UTF8_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_utf8_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
+  [ __m128i zero = _mm_setzero_si128();
+    return _mm_testz_si128(zero,
+      _mm_shuffle_epi8(zero,
+        _mm_alignr_epi8(zero, zero, 1)));])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_SSE42="$1"
+  pgac_sse42_utf8_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SSE42_UTF8_INTRINSICS
+
+
 # PGAC_SSE42_CRC32_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the x86 CRC instructions added in SSE 4.2,
diff --git a/configure b/configure
index ce9ea36999..fd7e1c5e0f 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,7 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_ARMV8_CRC32C
+PG_UTF8_OBJS
 CFLAGS_SSE42
 have_win32_dbghelp
 LIBOBJS
@@ -17670,6 +17671,93 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+# Check for Intel SSSE3 and SSE 4.1 intrinsics for UTF-8 validation.
+# Note: we reuse the flag, runtime check, and naming scheme used for SSE4.2.
+#
+# First check if the _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128
+# intrinsics can be used
+# with the default compiler flags. If not, check if adding the -msse4.2
+# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=" >&5
+$as_echo_n "checking for _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=... " >&6; }
+if ${pgac_cv_sse42_utf8_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.
*/ +#include +int +main () +{ + __m128i zero = _mm_setzero_si128(); + return _mm_testz_si128(zero, + _mm_shuffle_epi8(zero, + _mm_alignr_epi8(zero, zero, 1))); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_sse42_utf8_intrinsics_=yes +else + pgac_cv_sse42_utf8_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_utf8_intrinsics_" >&5 +$as_echo "$pgac_cv_sse42_utf8_intrinsics_" >&6; } +if test x"$pgac_cv_sse42_utf8_intrinsics_" = x"yes"; then + CFLAGS_SSE42="" + pgac_sse42_utf8_intrinsics=yes +fi + +if test x"$pgac_sse42_utf8_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2" >&5 +$as_echo_n "checking for _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 with CFLAGS=-msse4.2... " >&6; } +if ${pgac_cv_sse42_utf8_intrinsics__msse4_2+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -msse4.2" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +int +main () +{ + __m128i zero = _mm_setzero_si128(); + return _mm_testz_si128(zero, + _mm_shuffle_epi8(zero, + _mm_alignr_epi8(zero, zero, 1))); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_sse42_utf8_intrinsics__msse4_2=yes +else + pgac_cv_sse42_utf8_intrinsics__msse4_2=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_utf8_intrinsics__msse4_2" >&5 +$as_echo "$pgac_cv_sse42_utf8_intrinsics__msse4_2" >&6; } +if test x"$pgac_cv_sse42_utf8_intrinsics__msse4_2" = x"yes"; then + CFLAGS_SSE42="-msse4.2" + pgac_sse42_utf8_intrinsics=yes +fi + +fi + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used @@ -17777,6 +17865,63 @@ if ac_fn_c_try_compile "$LINENO"; then : fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +# Select UTF-8 validator implementation. +# XXX this was copy-pasted from the equivalent CRC checks -- there may be bugs. +# +# If we are targeting a processor that has SSE 4.2 instructions, we can use +# those to validate UTF-8 characters. If we're not targeting such +# a processor, but we can nevertheless produce code that uses the SSE +# intrinsics, perhaps with some extra CFLAGS, compile both implementations and +# select which one to use at runtime, depending on whether SSE 4.2 is supported +# by the processor we're running on. +# +# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1 +# in the template or configure command line. +if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then + if test x"$pgac_sse42_utf8_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then + USE_SSE42_UTF8=1 + else + # the CPUID instruction is needed for the runtime check. 
+ if test x"$pgac_sse42_utf8_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then + USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1 + else + # fall back to algorithm which doesn't require any special + # CPU support. + USE_FALLBACK_UTF8=1 + fi + fi +fi + +# Set PG_UTF8_OBJS appropriately depending on the selected implementation. +# XXX this was copy-pasted from the equivalent CRC checks -- there may be bugs. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to use" >&5 +$as_echo_n "checking which UTF-8 validator to use... " >&6; } +if test x"$USE_SSE42_UTF8" = x"1"; then + +$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_sse42.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 +$as_echo "SSE 4.2" >&6; } +else + if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then + +$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 +$as_echo "SSE 4.2 with runtime check" >&6; } + else + +$as_echo "#define USE_FALLBACK_UTF8 1" >>confdefs.h + + PG_UTF8_OBJS="pg_utf8_fallback.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 +$as_echo "slicing-by-8" >&6; } + fi +fi + + # Check for ARMv8 CRC Extension intrinsics to do CRC calculations. # # First check if __crc32c* intrinsics can be used with the default compiler @@ -17903,7 +18048,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && # fall back to slicing-by-8 algorithm, which doesn't require any # special CPU support. 
USE_SLICING_BY_8_CRC32C=1 - fi + fi fi fi fi diff --git a/configure.ac b/configure.ac index 07da84d401..d18965cde5 100644 --- a/configure.ac +++ b/configure.ac @@ -2017,6 +2017,19 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +# Check for Intel SSSE3 and SSE 4.1 intrinsics for UTF-8 validation. +# Note: we reuse the flag, runtime check, and naming scheme used for SSE4.2. +# +# First check if the _mm_alignr_epi8, _mm_shuffle_epi8, and _mm_testz_si128 +# intrinsics can be used +# with the default compiler flags. If not, check if adding the -msse4.2 +# flag helps. CFLAGS_SSE42 is set to -msse4.2 if that's required. +PGAC_SSE42_UTF8_INTRINSICS([]) +if test x"$pgac_sse42_utf8_intrinsics" != x"yes"; then + PGAC_SSE42_UTF8_INTRINSICS([-msse4.2]) +fi +AC_SUBST(CFLAGS_SSE42) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used @@ -2036,6 +2049,53 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [ #endif ])], [SSE4_2_TARGETED=1]) +# Select UTF-8 validator implementation. +# XXX this was copy-pasted from the equivalent CRC checks -- there may be bugs. +# +# If we are targeting a processor that has SSE 4.2 instructions, we can use +# those to validate UTF-8 characters. If we're not targeting such +# a processor, but we can nevertheless produce code that uses the SSE +# intrinsics, perhaps with some extra CFLAGS, compile both implementations and +# select which one to use at runtime, depending on whether SSE 4.2 is supported +# by the processor we're running on. +# +# You can override this logic by setting the appropriate USE_*_UTF8 flag to 1 +# in the template or configure command line. 
+if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+  if test x"$pgac_sse42_utf8_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+    USE_SSE42_UTF8=1
+  else
+    # the CPUID instruction is needed for the runtime check.
+    if test x"$pgac_sse42_utf8_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+    else
+      # fall back to algorithm which doesn't require any special
+      # CPU support.
+      USE_FALLBACK_UTF8=1
+    fi
+  fi
+fi
+
+# Set PG_UTF8_OBJS; the fallback is always built because the SSE 4.2 code calls it on error.
+# XXX this was copy-pasted from the equivalent CRC checks -- there may be bugs.
+AC_MSG_CHECKING([which UTF-8 validator to use])
+if test x"$USE_SSE42_UTF8" = x"1"; then
+  AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions.])
+  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+  AC_MSG_RESULT(SSE 4.2)
+else
+  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use the UTF-8 validator written with Intel SSE instructions with a runtime check.])
+    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+    AC_MSG_RESULT(SSE 4.2 with runtime check)
+  else
+    AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use the fallback UTF-8 validator written in C.])
+    PG_UTF8_OBJS="pg_utf8_fallback.o"
+    AC_MSG_RESULT(fallback)
+  fi
+fi
+AC_SUBST(PG_UTF8_OBJS)
+
 # Check for ARMv8 CRC Extension intrinsics to do CRC calculations.
 #
 # First check if __crc32c* intrinsics can be used with the default compiler
@@ -2084,7 +2144,7 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
           # fall back to slicing-by-8 algorithm, which doesn't require any
           # special CPU support.
USE_SLICING_BY_8_CRC32C=1 - fi + fi fi fi fi diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 74b3a6acd2..1d51ebe9c6 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -721,6 +721,9 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ +# files needed for the chosen UTF-8 validation implementation +PG_UTF8_OBJS = @PG_UTF8_OBJS@ + LIBS := -lpgcommon -lpgport $(LIBS) # to make ws2_32.lib the last library diff --git a/src/common/wchar.c b/src/common/wchar.c index 6e7d731e02..742957e67e 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -13,6 +13,7 @@ #include "c.h" #include "mb/pg_wchar.h" +#include "port/pg_utf8.h" /* @@ -1189,6 +1190,15 @@ pg_eucjp_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1247,6 +1257,15 @@ pg_euckr_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1330,6 +1349,15 @@ pg_euctw_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1383,6 +1411,15 @@ pg_johab_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. 
*/ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1433,6 +1470,15 @@ pg_mule_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1502,6 +1548,15 @@ pg_sjis_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1551,6 +1606,15 @@ pg_big5_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1600,6 +1664,15 @@ pg_gbk_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1649,6 +1722,15 @@ pg_uhc_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') @@ -1709,6 +1791,15 @@ pg_gb18030_verifystr(const unsigned char *s, int len) int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. 
*/
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1760,30 +1851,7 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
-	const unsigned char *start = s;
-
-	while (len > 0)
-	{
-		int			l;
-
-		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
-		{
-			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
-		}
-		s += l;
-		len -= l;
-	}
-
-	return s - start;
+	return pg_validate_utf8(s, len);
 }
 
 /*
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 55cab4d2bf..303dae4441 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -905,6 +905,15 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to use the fallback UTF-8 validator written in C. */
+#undef USE_FALLBACK_UTF8
+
+/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions. */
+#undef USE_SSE42_UTF8
+
+/* Define to 1 to use the UTF-8 validator written with Intel SSE instructions with a runtime check. */
+#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
new file mode 100644
index 0000000000..a259c59cf5
--- /dev/null
+++ b/src/include/port/pg_utf8.h
@@ -0,0 +1,77 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8.h
+ *	  Routines for fast validation of UTF-8 text.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_utf8.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_UTF8_H
+#define PG_UTF8_H
+
+
+#if defined(USE_SSE42_UTF8)
+/* Use Intel SSE4.2 instructions. */
+extern int	pg_validate_utf8_sse42(const unsigned char *s, int len);
+#define pg_validate_utf8(s, len) pg_validate_utf8_sse42((s), (len))
+
+#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+/*
+ * Use Intel SSE 4.2 instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+extern int	(*pg_validate_utf8) (const unsigned char *s, int len);
+extern int	pg_validate_utf8_sse42(const unsigned char *s, int len);
+
+#else
+/* Use the portable C implementation. */
+#define pg_validate_utf8(s, len) pg_validate_utf8_fallback((s), (len))
+
+#endif							/* USE_SSE42_UTF8 */
+
+extern int	pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+
+/* from https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
+#define HAS_ZERO(chunk) ( \
+	((chunk) - UINT64CONST(0x0101010101010101)) & \
+	 ~(chunk) & \
+	 UINT64CONST(0x8080808080808080))
+
+/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		half1,	half2,
+				highbit_mask;
+
+	if (len >= 2 * sizeof(uint64))
+	{
+		memcpy(&half1, s, sizeof(uint64));
+		memcpy(&half2, s + sizeof(uint64), sizeof(uint64));
+
+		/*
+		 * If there are any zero bytes, bail and let the slow
+		 * path handle it.
+		 */
+		if (HAS_ZERO(half1) || HAS_ZERO(half2))
+			return 0;
+
+		/* Check if any bytes in this chunk have the high bit set.
*/
+		highbit_mask = ((half1 | half2) & UINT64CONST(0x8080808080808080));
+
+		if (!highbit_mask)
+			return 2 * sizeof(uint64);
+		else
+			return 0;
+	}
+
+	return 0;
+}
+
+#endif							/* PG_UTF8_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index e41b005c4f..7a7e000b9d 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -40,6 +40,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
 	$(LIBOBJS) \
 	$(PG_CRC32C_OBJS) \
+	$(PG_UTF8_OBJS) \
 	chklocale.o \
 	erand48.o \
 	inet_net_ntop.o \
@@ -88,6 +89,11 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
+# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
+pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c
new file mode 100644
index 0000000000..1615c48233
--- /dev/null
+++ b/src/port/pg_utf8_fallback.c
@@ -0,0 +1,137 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_fallback.c
+ *	  Validate UTF-8 with a fast path for the ASCII subset.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_fallback.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include "port/pg_utf8.h"
+
+
+/* true if "c" has the form 10xxxxxx, i.e. is a UTF-8 continuation byte */
+#define IS_CONTINUATION_BYTE(c)	(((c) & 0xC0) == 0x80)
+
+/*
+ * Return the number of bytes at the start of "s" that are valid UTF-8,
+ * examining at most "len" bytes. Verification stops at the first invalid or
+ * truncated sequence, or at a zero byte.
+ *
+ * See the comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_fallback(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	unsigned char b1, b2, b3, b4;
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		/* code points U+0080 through U+07FF */
+		else if ((*s & 0xE0) == 0xC0)
+		{
+			l = 2;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+
+			if (!IS_CONTINUATION_BYTE(b2))
+				break;
+
+			/* check 2-byte overlong: 1100.000x.10xx.xxxx */
+			if (b1 < 0xC2)
+				break;
+		}
+		/* code points U+0800 through U+D7FF and U+E000 through U+FFFF */
+		else if ((*s & 0xF0) == 0xE0)
+		{
+			l = 3;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3))
+				break;
+
+			/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+			if (b1 == 0xE0 && b2 < 0xA0)
+				break;
+
+			/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+			if (b1 == 0xED && b2 > 0x9F)
+				break;
+		}
+		/* code points U+010000 through U+10FFFF */
+		else if ((*s & 0xF8) == 0xF0)
+		{
+			l = 4;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+			b4 = *(s + 3);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3) ||
+				!IS_CONTINUATION_BYTE(b4))
+				break;
+
+			/*
+			 * check 4-byte overlong:
+			 * 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+			 */
+			if (b1 == 0xF0 && b2 < 0x90)
+				break;
+
+			/*
+			 * check too large (above U+10FFFF): lead byte 0xF4 followed by
+			 * a second byte above 0x8F, or any lead byte above 0xF4
+			 */
+			if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+				break;
+		}
+		else
+			/* invalid byte */
+			break;
+
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_sse42.c
new file mode 100644
index 0000000000..417f2142c7
--- /dev/null
+++ 
b/src/port/pg_utf8_sse42.c
@@ -0,0 +1,424 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42.c
+ *	  Validate UTF-8 with Intel SSE 4.2 instructions.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_sse42.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+#include "port/pg_utf8.h"
+
+/* See Keiser & Lemire, "Validating UTF-8 In Less Than One Instruction Per Byte" (2020). */
+
+/*
+ * Lookup tables for classifying two-byte sequences
+ *
+ * These constants were taken nearly verbatim from simdjson (Apache 2.0 license)
+ *
+ * XXX had to add a bunch of casts to prevent warnings -- needs more work
+ *
+ * IMHO a better symbol name for TOO_LONG is ASC_CONT
+ *
+ * simdjson also didn't seem to put the numerical values in a logical order,
+ * but the only one that MUST be as below is TWO_CONTS, since that indicates
+ * we can't say there's an error until we look at previous bytes.
+ */
+#define TOO_SHORT       ((uint8) (1 << 0))  /* 11______ 0_______ */
+                                            /* 11______ 11______ */
+#define TOO_LONG        ((uint8) (1 << 1))  /* 0_______ 10______ */
+#define OVERLONG_3      ((uint8) (1 << 2))  /* 11100000 100_____ */
+#define SURROGATE       ((uint8) (1 << 4))  /* 11101101 101_____ */
+#define OVERLONG_2      ((uint8) (1 << 5))  /* 1100000_ 10______ */
+#define TWO_CONTS       ((uint8) (1 << 7))  /* 10______ 10______ */
+#define TOO_LARGE       ((uint8) (1 << 3))  /* 11110100 1001____ */
+                                            /* 11110100 101_____ */
+                                            /* 11110101 1001____ */
+                                            /* 11110101 101_____ */
+                                            /* 1111011_ 1001____ */
+                                            /* 1111011_ 101_____ */
+                                            /* 11111___ 1001____ */
+                                            /* 11111___ 101_____ */
+#define TOO_LARGE_1000  ((uint8) (1 << 6))  /* 11110101 1000____ */
+                                            /* 1111011_ 1000____ */
+                                            /* 11111___ 1000____ */
+#define OVERLONG_4      ((uint8) (1 << 6))  /* 11110000 1000____ */
+
+/* These all have ____ in byte 1 */
+#define CARRY           ((uint8) (TOO_SHORT | TOO_LONG | TWO_CONTS))
+
+/* XXX the following tables could just be static variables */
+
+/*
+ * table for looking up possible errors in the high nibble of
+ * the first byte of a 2-byte sequence
+ */
+static inline __m128i
+byte_1_high_table(void)
+{
+	return _mm_setr_epi8(
+		/* 0_______ ________ */
+		TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+		TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+		/* 10______ ________ */
+		TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+		/* 1100____ ________ */
+		TOO_SHORT | OVERLONG_2,
+		/* 1101____ ________ */
+		TOO_SHORT,
+		/* 1110____ ________ */
+		TOO_SHORT | OVERLONG_3 | SURROGATE,
+		/* 1111____ ________ */
+		TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4
+	);
+}
+
+/*
+ * table for looking up possible errors in the low nibble of
+ * the first byte of a 2-byte sequence
+ */
+static inline __m128i
+byte_1_low_table(void)
+{
+	return _mm_setr_epi8(
+		/* ____0000 ________ */
+		(uint8) (CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4),
+		/* ____0001 ________ */
+		(uint8) (CARRY | OVERLONG_2),
+		/* ____001_ ________ */
+		CARRY,
+		CARRY,
+
+		/* ____0100 ________ */
+		(uint8) (CARRY | TOO_LARGE),
+		/* ____0101 ________ */
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		/* ____011_ ________ */
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+
+		/* ____1___ ________ */
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		/* ____1101 ________ */
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000),
+		(uint8) (CARRY | TOO_LARGE | TOO_LARGE_1000)
+	);
+}
+
+/*
+ * table for looking up possible errors in the high nibble of
+ * the second byte of a 2-byte sequence
+ */
+static inline __m128i
+byte_2_high_table(void)
+{
+	return _mm_setr_epi8(
+		/* ________ 0_______ */
+		TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+		TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+
+		/* ________ 1000____ */
+		(uint8) (TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4),
+		/* ________ 1001____ */
+		(uint8) (TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE),
+		/* ________ 101_____ */
+		(uint8) (TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE),
+		(uint8) (TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE),
+
+		/* ________ 11______ */
+		TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
+	);
+}
+
+/* helper functions to wrap intrinsics */
+
+/* return a zeroed vector */
+static inline __m128i
+vzero(void)
+{
+	return _mm_setzero_si128();
+}
+
+/* perform an unaligned load from memory and return the register */
+static inline __m128i
+vload(const unsigned char *raw_input)
+{
+	return _mm_loadu_si128((const __m128i *) raw_input);
+}
+
+/* return a vector with set bits where any bytes in the input are zero */
+static inline __m128i
+has_zero(const __m128i v)
+{
+	return _mm_cmpeq_epi8(v, vzero());
+}
+
+/* return a vector with each 8-bit lane populated with the input scalar */
+static inline __m128i
+splat(uint8 byte)
+{
+	return _mm_set1_epi8(byte);
+}
+
+/* perform signed greater-than on all 8-bit lanes */
+static inline __m128i
+greater_than(const __m128i v1, const __m128i v2)
+{
+	return _mm_cmpgt_epi8(v1, v2);
+}
+
+/*
+ * Shift right each 8-bit lane
+ *
+ * There is no intrinsic to do this on 8-bit lanes, so shift right in each
+ * 16-bit lane then apply a mask of 1-bytes shifted the same amount.
+ */
+static inline __m128i
+shift_right(const __m128i v, const int n)
+{
+	const __m128i shift16 = _mm_srli_epi16(v, n);
+	const __m128i mask = splat(0xFF >> n);
+	return _mm_and_si128(shift16, mask);
+}
+
+/* Bitwise vector operations */
+static inline __m128i
+bitwise_and(const __m128i v1, const __m128i v2)
+{
+	return _mm_and_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_or(const __m128i v1, const __m128i v2)
+{
+	return _mm_or_si128(v1, v2);
+}
+
+static inline __m128i
+bitwise_xor(const __m128i v1, const __m128i v2)
+{
+	return _mm_xor_si128(v1, v2);
+}
+
+/*
+ * Do unsigned subtraction, but instead of wrapping around
+ * on overflow, stop at zero. Useful for emulating unsigned
+ * comparison.
+ */
+static inline __m128i
+saturating_sub(const __m128i v1, const __m128i v2)
+{
+	return _mm_subs_epu8(v1, v2);
+}
+
+/* return false if a register is zero, true otherwise */
+static inline bool
+to_bool(const __m128i v)
+{
+	/* _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is zero. */
+	return !_mm_testz_si128(v, v);
+}
+
+/*
+ * Shift entire "input" register right by N 8-bit lanes, and
+ * replace the first N lanes with the last N lanes from the
+ * "prev" register. Can be stated in C thusly:
+ *
+ * ((input << 128) | prev) >> ((16 - N) * 8), truncated to 128 bits
+ *
+ * The third argument to the intrinsic must be a numeric constant, so
+ * we must have separate functions for different shift amounts.
+ */
+static inline __m128i
+prev1(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+}
+
+static inline __m128i
+prev2(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+}
+
+static inline __m128i
+prev3(__m128i prev, __m128i input)
+{
+	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+}
+
+/*
+ * For each 1-byte lane in the input, use that value as an index
+ * into the lookup register as if it were a 16-element byte array.
+ */
+static inline __m128i
+lookup(const __m128i input, const __m128i table)
+{
+	return _mm_shuffle_epi8(table, input);
+}
+
+/* The actual algorithm */
+
+/*
+ * classify each 2-byte sequence in the input register
+ *
+ * Technically, it leaves off the last byte, but we'll get it
+ * from the "prev" register on the next loop iteration.
+ */
+static inline __m128i
+classify(const __m128i prev, const __m128i input)
+{
+	const __m128i input_shift1 = prev1(prev, input);
+
+	/* put the relevant nibbles into their own bytes in their own registers */
+	const __m128i byte_1_high = shift_right(input_shift1, 4);
+	const __m128i byte_1_low = bitwise_and(input_shift1, splat(0x0F));
+	const __m128i byte_2_high = shift_right(input, 4);
+
+	/* lookup the possible errors for each set of nibbles */
+	const __m128i lookup_1_high = lookup(byte_1_high, byte_1_high_table());
+	const __m128i lookup_1_low = lookup(byte_1_low, byte_1_low_table());
+	const __m128i lookup_2_high = lookup(byte_2_high, byte_2_high_table());
+
+	/*
+	 * AND all the lookups together. At this point, non-zero
+	 * values in vector returned represent
+	 *
+	 * 1) invalid 2-byte sequences
+	 * 2) the second continuation byte of a possible 3- or 4-byte character
+	 * 3) the third continuation byte of a possible 4-byte character
+	 */
+	return bitwise_and(bitwise_and(lookup_1_high, lookup_1_low), lookup_2_high);
+}
+
+/*
+ * Return a mask of locations of lead bytes for 3- and 4-byte characters.
+ * Such lead bytes are found 2 and 3 bytes earlier in the sequence, respectively.
+ */
+static inline __m128i
+get_lead_byte_mask(const __m128i prev, const __m128i input)
+{
+	/* create registers that are shifted up by 2 and 3 bytes */
+	const __m128i input_shift2 = prev2(prev, input);
+	const __m128i input_shift3 = prev3(prev, input);
+
+	/*
+	 * Look in the shifted registers for 3- or 4-byte lead bytes (>= 0xE0 and
+	 * >= 0xF0, respectively). There is no unsigned comparison, so we use
+	 * saturating subtraction followed by signed comparison with zero. Any
+	 * non-zero bytes in the result represent such leads.
+	 */
+	const __m128i is_third_byte = saturating_sub(input_shift2, splat(0xE0 - 1));
+	const __m128i is_fourth_byte = saturating_sub(input_shift3, splat(0xF0 - 1));
+
+	/*
+	 * If we find such leads 2 or 3 bytes previous, set all bits for the current
+	 * byte. Signed arithmetic is okay because the values are small.
+	 */
+	const __m128i must23 = greater_than(bitwise_or(is_third_byte, is_fourth_byte), vzero());
+
+	/*
+	 * greater_than() sets all bits in the result when true. We want to compare
+	 * with the result of the classifier so apply a mask to allow only the high bit
+	 * to be set. This matches the TWO_CONTS symbol above.
+	 */
+	return bitwise_and(must23, splat(0x80));
+}
+
+/*
+ * Return a register that is non-zero in every lane of "input" where an error
+ * was detected, given the preceding 16 bytes in "prev". A sequence that is cut
+ * off at the end of "input" is only reported by the call for the next chunk.
+ */
+static inline __m128i
+check_utf8_bytes(const __m128i prev, const __m128i input)
+{
+	const __m128i special_cases = classify(prev, input);
+	const __m128i lead_byte_mask = get_lead_byte_mask(prev, input);
+	return bitwise_xor(lead_byte_mask, special_cases);
+}
+
+/*
+ * Return the number of leading bytes of "s" (of at most "len") that are valid
+ * UTF-8 and non-zero. If the vectorized check finds any error, the fallback
+ * is used to locate the end of the valid prefix.
+ */
+int
+pg_validate_utf8_sse42(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	const int	orig_len = len;
+	unsigned char inbuf[sizeof(__m128i)];
+	unsigned char maskbuf[sizeof(__m128i)];
+
+	/* No previous input or error yet, so start from zeroed registers. */
+	__m128i		prev = vzero();
+	__m128i		error = vzero();
+	__m128i		input;
+
+	/* Nothing to verify; also keeps a bogus length out of the math below. */
+	if (len <= 0)
+		return 0;
+
+	while (len >= (int) sizeof(__m128i))
+	{
+		input = vload(s);
+
+		/* check for zeros */
+		error = bitwise_or(error, has_zero(input));
+
+		/* TODO: fast path for ascii bytes? */
+
+		/* do the UTF-8 validation */
+		error = bitwise_or(error, check_utf8_bytes(prev, input));
+
+		prev = input;
+		s += sizeof(__m128i);
+		len -= sizeof(__m128i);
+	}
+
+	/*
+	 * Check the remaining 0..15 bytes, back-filled with zero bytes. Do this
+	 * even when nothing remains: a character cut off at the very end of the
+	 * last full register is only detected by examining the byte after it.
+	 */
+	memset(inbuf, 0, sizeof(inbuf));
+	memcpy(inbuf, s, len);
+	input = vload(inbuf);
+
+	/*
+	 * The padding must not trip the zero-byte check, so OR the input with a
+	 * mask that is 0xFF in the padding lanes and zero elsewhere.
+	 */
+	memset(maskbuf, 0, sizeof(maskbuf));
+	memset(maskbuf + len, 0xFF, sizeof(maskbuf) - len);
+	error = bitwise_or(error, has_zero(bitwise_or(input, vload(maskbuf))));
+
+	/* do the UTF-8 validation */
+	error = bitwise_or(error, check_utf8_bytes(prev, input));
+
+	/*
+	 * FIXME: in the new noError conversions, we could have incomplete bytes
+	 * at the end. We'll need some extra logic to find the end of the last
+	 * verified character. For now, it's correct to give up on any error.
+	 */
+	if (to_bool(error))
+		return pg_validate_utf8_fallback(start, orig_len);
+
+	return orig_len;
+}
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_sse42_choose.c
new file mode 100644
index 0000000000..263b840150
--- /dev/null
+++ b/src/port/pg_utf8_sse42_choose.c
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42_choose.c
+ *	  Choose between Intel SSE 4.2 and fallback implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel SSE
+ * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
+ * fall back to the pure C implementation which has a fast path for ASCII
+ * text.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_sse42_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_utf8.h"
+
+/*
+ * Report whether the SSE 4.2 implementation may be used on this CPU.
+ *
+ * Queries CPUID leaf 1 through whichever of __get_cpuid() or __cpuid() is
+ * available and tests bit 20 of the third result register, the SSE 4.2
+ * flag. Returns false unconditionally when __x86_64__ is not defined.
+ *
+ * NOTE(review): compilers that do not define __x86_64__ on 64-bit x86
+ * would always take the "false" branch -- confirm that this is intended.
+ */
+static bool
+pg_utf8_sse42_available(void)
+{
+	/* To save from checking every SSE2 intrinsic, insist on 64-bit. */
+#ifdef __x86_64__
+	/* zero-initialized, so a failed query leaves the feature bit clear */
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif							/* HAVE__GET_CPUID */
+	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
+
+#else
+	return false;
+#endif							/* __x86_64__ */
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ *
+ * The current request is then served through the updated pointer, so the
+ * return value is whatever the chosen implementation returns for (s, len).
+ */
+static int
+pg_validate_utf8_choose(const unsigned char *s, int len)
+{
+	if (pg_utf8_sse42_available())
+		pg_validate_utf8 = pg_validate_utf8_sse42;
+	else
+		pg_validate_utf8 = pg_validate_utf8_fallback;
+
+	return pg_validate_utf8(s, len);
+}
+
+/* Starts out pointing at the chooser, which re-targets it on first use. */
+int			(*pg_validate_utf8) (const unsigned char *s, int len) = pg_validate_utf8_choose;
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index e34ab20974..e37bda8057 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -72,6 +72,58 @@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + description | result | errorat | error +------------------------------------+------------+--------------+---------------------------------------------------------------- + bare continuation | \x | \xaf | invalid byte sequence for
encoding "UTF8": 0xaf + missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5 + smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80 + largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf + next 2-byte after overlongs | \xc280 | | + largest 2-byte | \xdfbf | | + missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf + smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80 + largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf + next 3-byte after overlong | \xe0a080 | | + last before surrogates | \xed9fbf | | + smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80 + largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf + next after surrogates | \xee8080 | | + largest 3-byte | \xefbfbf | | + missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf + smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80 + largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf + next 4-byte after overlong | \xf0908080 | | + largest 4-byte | \xf48fbfbf | | + smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80 + 5 byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa +(22 rows) + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index ea85f20ed8..7f761cd630 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -74,6 +74,34 
@@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); + +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 2aa062b2c9..5f778570ee 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -489,6 +489,9 @@ sub GenerateFiles USE_NAMED_POSIX_SEMAPHORES => undef, USE_OPENSSL => undef, USE_PAM => undef, + USE_FALLBACK_UTF8 => undef, + USE_SSE42_UTF8 => undef, + USE_SSE42_UTF8_WITH_RUNTIME_CHECK => undef, USE_SLICING_BY_8_CRC32C => undef, USE_SSE42_CRC32C => undef, USE_SSE42_CRC32C_WITH_RUNTIME_CHECK => 1,