From c10bf1271e586c2cdebfb8e05a2dd9533c850d4a Mon Sep 17 00:00:00 2001 From: John Naylor Date: Fri, 16 Jul 2021 18:16:03 -0400 Subject: [PATCH v17 2/2] Use integer chunk for fast path multibyte check Based on an idea from Amit Khandekar: https://www.postgresql.org/message-id/CAJ3gD9ejC%2BpuY%3DLgco2SGyD4tR46kye7qLZoskW0PXumtLcCpQ%40mail.gmail.com --- src/common/wchar.c | 158 ++++++++++++++++----------------------------- 1 file changed, 56 insertions(+), 102 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 3ccef6c3cb..ec4bbb3b6a 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -13,49 +13,36 @@ #include "c.h" #include "mb/pg_wchar.h" +#include "port/pg_bswap.h" -/* for UTF-8 */ -#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80) -#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0) -#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0) -#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0) - /* Verify a chunk of bytes for valid ASCII including a zero-byte check. */ static inline int -check_ascii(const unsigned char *s, int len) +check_ascii(const uint64 chunk) { - uint64 chunk, - highbits_set, + uint64 highbits_set, highbit_carry; - if (len >= sizeof(uint64)) - { - memcpy(&chunk, s, sizeof(uint64)); - - /* Check if any bytes in this chunk have the high bit set. */ - highbits_set = chunk & UINT64CONST(0x8080808080808080); - if (highbits_set) - return 0; + /* Check if any bytes in this chunk have the high bit set. */ + highbits_set = chunk & UINT64CONST(0x8080808080808080); + if (highbits_set) + return 0; - /* - * Check if there are any zero bytes in this chunk. - * - * First, add 0x7f to each byte. This sets the high bit in each byte, - * unless it was a zero. We already checked that none of the bytes had - * the high bit set previously, so the max value each byte can have - * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to - * worry about carrying over to the next byte. 
- */ - highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f); + /* + * Check if there are any zero bytes in this chunk. + * + * First, add 0x7f to each byte. This sets the high bit in each byte, + * unless it was a zero. We already checked that none of the bytes had the + * high bit set previously, so the max value each byte can have after the + * addition is 0x7f + 0x7f = 0xfe, and we don't need to worry about + * carrying over to the next byte. + */ + highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f); - /* Then check that the high bit is set in each byte. */ - highbit_carry &= UINT64CONST(0x8080808080808080); - if (highbit_carry == UINT64CONST(0x8080808080808080)) - return sizeof(uint64); - else - return 0; - } + /* Then check that the high bit is set in each byte. */ + highbit_carry &= UINT64CONST(0x8080808080808080); + if (highbit_carry == UINT64CONST(0x8080808080808080)) + return sizeof(chunk); else return 0; } @@ -1804,92 +1791,60 @@ pg_utf8_verifychar(const unsigned char *s, int len) /* * Subroutine of pg_utf8_verifystr() to check on char. Returns the length of the - * character at *s in bytes, or 0 on invalid input or premature end of input. - * - * XXX: could this be combined with pg_utf8_verifychar above? + * character at the start of the chunk in bytes, or 0 on invalid input, + * including a zero byte. */ static inline int -pg_utf8_verify_one(const unsigned char *s, int len) +pg_utf8_verify_one(const uint64 chunk_orig) { int l; - unsigned char b1, - b2, - b3, - b4; + const uint64 chunk = (pg_hton64(chunk_orig)); - /* Found non-ASCII or zero above, so verify a single character. 
*/ - if (!IS_HIGHBIT_SET(*s)) + /* high bit not set */ + if ((chunk & 0x8000000000000000) == 0) { - if (*s == '\0') + /* check first byte for zero */ + if (chunk < 0x0100000000000000) return 0; + l = 1; } - /* code points U+0080 through U+07FF */ - else if (IS_TWO_BYTE_LEAD(*s)) + /* 2-byte lead with one continuation byte */ + else if ((chunk & 0xE0C0000000000000) == 0xC080000000000000) { - l = 2; - if (len < l) - return 0; - - b1 = *s; - b2 = *(s + 1); - - if (!IS_CONTINUATION_BYTE(b2)) - return 0; - /* check 2-byte overlong: 1100.000x.10xx.xxxx */ - if (b1 < 0xC2) + if (chunk < 0xC200000000000000) return 0; + + l = 2; } - /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ - else if (IS_THREE_BYTE_LEAD(*s)) + /* 3-byte lead with two continuation bytes */ + else if ((chunk & 0xF0C0C00000000000) == 0xE080800000000000) { - l = 3; - if (len < l) - return 0; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3)) - return 0; - - /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */ - if (b1 == 0xE0 && b2 < 0xA0) + /* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */ + if (chunk < 0xE0A0000000000000) return 0; /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ - if (b1 == 0xED && b2 > 0x9F) + if (chunk > 0xED9FBFFFffffffff && chunk < 0xEE00000000000000) return 0; + + l = 3; } - /* code points U+010000 through U+10FFFF */ - else if (IS_FOUR_BYTE_LEAD(*s)) + /* 4-byte lead with three continuation bytes */ + else if ((chunk & 0xF8C0C0C000000000) == 0xF080808000000000) { - l = 4; - if (len < l) - return 0; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - b4 = *(s + 3); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3) || - !IS_CONTINUATION_BYTE(b4)) - return 0; - /* * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx */ - if (b1 == 0xF0 && b2 < 0x90) + if (chunk < 0xF090000000000000) return 0; /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */ - if 
((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) + if (chunk > 0xF48FBFBFffffffff) return 0; + + l = 4; } else /* invalid byte */ @@ -1898,22 +1853,23 @@ pg_utf8_verify_one(const unsigned char *s, int len) return l; } - static int pg_utf8_verifystr(const unsigned char *s, int len) { const unsigned char *start = s; + uint64 chunk; /* - * Fast path when we have at least 8 bytes left in the string. We can skip the - * length checks in the loop. + * Fast path when we have at least 8 bytes left in the string. */ while (len >= 8) { int l; + memcpy(&chunk, s, sizeof(chunk)); + /* fast path for ASCII-subset characters */ - l = check_ascii(s, 8); + l = check_ascii(chunk); if (l) { s += l; @@ -1923,10 +1879,8 @@ pg_utf8_verifystr(const unsigned char *s, int len) /* * Found non-ASCII or zero above, so verify a single character. - * By passing length as constant, the compiler should optimize away - * the length-checks in pg_utf8_verify_one. */ - l = pg_utf8_verify_one(s, 8); + l = pg_utf8_verify_one(chunk); if (l == 0) goto end; @@ -1939,8 +1893,8 @@ pg_utf8_verifystr(const unsigned char *s, int len) { int l; - l = pg_utf8_verify_one(s, len); - if (l == 0) + l = pg_utf8_verifychar(s, len); + if (l == -1) goto end; s += l; -- 2.31.1