From 6360898811851d0b5484e6aab7219cd7379c20e1 Mon Sep 17 00:00:00 2001
From: John Naylor
Date: Sun, 18 Jul 2021 19:11:25 -0400
Subject: [PATCH v18 4/6] Check ascii 8-bytes at a time with bitwise operations

---
 src/common/wchar.c                       | 53 ++++++++++++++++++++----
 src/test/regress/expected/conversion.out |  2 +-
 src/test/regress/sql/conversion.sql      |  2 +-
 3 files changed, 48 insertions(+), 9 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 4ea352bcf1..bba54912f9 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -1932,24 +1932,63 @@ utf8_advance(const unsigned char *s)
 	return l;
 }
 
+/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		chunk,
+				highbits_set,
+				highbit_carry;
+
+	if (len >= (int) sizeof(uint64))
+	{
+		memcpy(&chunk, s, sizeof(uint64));
+
+		/* Check if any bytes in this chunk have the high bit set. */
+		highbits_set = chunk & UINT64CONST(0x8080808080808080);
+		if (highbits_set)
+			return 0;
+
+		/*
+		 * Check if there are any zero bytes in this chunk.
+		 *
+		 * First, add 0x7f to each byte. This sets the high bit in each byte,
+		 * unless it was a zero. We already checked that none of the bytes had
+		 * the high bit set previously, so the max value each byte can have
+		 * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to
+		 * worry about carrying over to the next byte.
+		 */
+		highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+		/* Then check that the high bit is set in each byte. */
+		highbit_carry &= UINT64CONST(0x8080808080808080);
+		if (highbit_carry == UINT64CONST(0x8080808080808080))
+			return sizeof(uint64);
+		else
+			return 0;
+	}
+	else
+		return 0;
+}
+
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 
 	/*
-	 * fast path when we have enough bytes left in the string to cover all
-	 * valid UTF-8 sequences
+	 * fast path when we have enough bytes left in the string to use bitwise operations
 	 */
-	while (len >= 4)
+	while (len >= 8)
 	{
 		int			l;
 
-		/* check if the first byte is both non-zero and doesn't have the high bit set */
-		if ((signed char) (*s) > 0)
+		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, 8);
+		if (l)
 		{
-			s++;
-			len--;
+			s += l;
+			len -= l;
 			continue;
 		}
 
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index e4ab9fe765..62461063b3 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -141,7 +141,7 @@ with test_bytes as (
 ), test_padded as (
   select
     description,
-    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+    (test_conv(inbytes || repeat('.', 8)::bytea, 'utf8', 'utf8')).error
   from test_bytes
 )
 select
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index e5a7e47958..5d0280cd1c 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -118,7 +118,7 @@ with test_bytes as (
 ), test_padded as (
   select
     description,
-    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+    (test_conv(inbytes || repeat('.', 8)::bytea, 'utf8', 'utf8')).error
   from test_bytes
 )
 select
-- 
2.31.1