From 96dcf1ae9d73df3287d521b009b39c2728a993af Mon Sep 17 00:00:00 2001 From: John Naylor Date: Sun, 6 Jun 2021 11:13:09 -0400 Subject: [PATCH v11 1/2] Rewrite pg_utf8_verifystr() for speed Instead of relying on pg_utf8_verifychar() and pg_utf8_isvalid(), rewrite this function in a manner loosely based on the fallback that is part of the simdjson library. Verifying multibyte UTF-8 text is modestly faster, but the biggest improvement is in verifying ASCII, which is now around 6x times faster, depending on platform. --- src/common/wchar.c | 134 ++++++++++++++++++++++- src/test/regress/expected/conversion.out | 106 ++++++++++++++++-- src/test/regress/sql/conversion.sql | 66 ++++++++++- 3 files changed, 294 insertions(+), 12 deletions(-) diff --git a/src/common/wchar.c b/src/common/wchar.c index 6e7d731e02..2805d01f7f 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -15,6 +15,56 @@ #include "mb/pg_wchar.h" +/* for UTF-8 */ +#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80) +#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0) +#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0) +#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0) + +/* Verify a chunk of bytes for valid ASCII including a zero-byte check. */ +static inline int +check_ascii(const unsigned char *s, int len) +{ + uint64 half1, + half2, + highbits_set, + x1, + x2, + x; + + if (len >= 2 * sizeof(uint64)) + { + memcpy(&half1, s, sizeof(uint64)); + memcpy(&half2, s + sizeof(uint64), sizeof(uint64)); + + /* Check if any bytes in this chunk have the high bit set. */ + highbits_set = (half1 | half2) & UINT64CONST(0x8080808080808080); + if (highbits_set) + return 0; + + /* + * Check if there are any zero bytes in this chunk. + * + * First, add 0x7f to each byte. This sets the high bit in each byte, + * unless it was a zero. We already checked that none of the bytes had + * the high bit set previously, so the max value each byte can have + * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to + * worry about carrying over to the next byte. + */ + x1 = half1 + UINT64CONST(0x7f7f7f7f7f7f7f7f); + x2 = half2 + UINT64CONST(0x7f7f7f7f7f7f7f7f); + + /* Then check that the high bit is set in each byte. */ + x = (x1 & x2) & UINT64CONST(0x8080808080808080); + if (x != UINT64CONST(0x8080808080808080)) + return 0; + + return 2 * sizeof(uint64); + } + else + return 0; +} + /* * Operations on multi-byte encodings are driven by a table of helper * functions. @@ -1761,24 +1811,102 @@ static int pg_utf8_verifystr(const unsigned char *s, int len) { const unsigned char *start = s; + unsigned char b1, + b2, + b3, + b4; while (len > 0) { int l; /* fast path for ASCII-subset characters */ + l = check_ascii(s, len); + if (l) + { + s += l; + len -= l; + continue; + } + + /* Found non-ASCII or zero above, so verify a single character. */ if (!IS_HIGHBIT_SET(*s)) { if (*s == '\0') break; l = 1; } - else + /* code points U+0080 through U+07FF */ + else if (IS_TWO_BYTE_LEAD(*s)) { - l = pg_utf8_verifychar(s, len); - if (l == -1) + l = 2; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + + if (!IS_CONTINUATION_BYTE(b2)) + break; + + /* check 2-byte overlong: 1100.000x.10xx.xxxx */ + if (b1 < 0xC2) + break; + } + /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ + else if (IS_THREE_BYTE_LEAD(*s)) + { + l = 3; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3)) + break; + + /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */ + if (b1 == 0xE0 && b2 < 0xA0) + break; + + /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ + if (b1 == 0xED && b2 > 0x9F) break; } + /* code points U+010000 through U+10FFFF */ + else if (IS_FOUR_BYTE_LEAD(*s)) + { + l = 4; + if (len < l) + break; + + b1 = *s; + b2 = *(s + 1); + b3 = *(s + 2); + b4 = *(s + 3); + + if (!IS_CONTINUATION_BYTE(b2) || + !IS_CONTINUATION_BYTE(b3) || + !IS_CONTINUATION_BYTE(b4)) + break; + + /* + * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx + */ + if (b1 == 0xF0 && b2 < 0x90) + break; + + /* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */ + if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) + break; + } + else + /* invalid byte */ + break; + s += l; len -= l; } diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index 04fdcba496..9315ad3abd 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -72,6 +72,58 @@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + description | result | errorat | error +------------------------------------+------------+--------------+---------------------------------------------------------------- + bare continuation | \x | \xaf | invalid byte sequence for encoding "UTF8": 0xaf + missing second byte in 2-byte char | \x | \xc5 | invalid byte sequence for encoding "UTF8": 0xc5 + smallest 2-byte overlong | \x | \xc080 | invalid byte sequence for encoding "UTF8": 0xc0 0x80 + largest 2-byte overlong | \x | \xc1bf | invalid byte sequence for encoding "UTF8": 0xc1 0xbf + next 2-byte after overlongs | \xc280 | | + largest 2-byte | \xdfbf | | + missing third byte in 3-byte char | \x | \xe9af | invalid byte sequence for encoding "UTF8": 0xe9 0xaf + smallest 3-byte overlong | \x | \xe08080 | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80 + largest 3-byte overlong | \x | \xe09fbf | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf + next 3-byte after overlong | \xe0a080 | | + last before surrogates | \xed9fbf | | + smallest surrogate | \x | \xeda080 | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80 + largest surrogate | \x | \xedbfbf | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf + next after surrogates | \xee8080 | | + largest 3-byte | \xefbfbf | | + missing fourth byte in 4-byte char | \x | \xf1afbf | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf + smallest 4-byte overlong | \x | \xf0808080 | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80 + largest 4-byte overlong | \x | \xf08fbfbf | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf + next 4-byte after overlong | \xf0908080 | | + largest 4-byte | \xf48fbfbf | | + smallest too large | \x | \xf4908080 | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80 + 5 byte | \x | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa +(22 rows) + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), @@ -85,7 +137,7 @@ insert into utf8_inputs values ('\x666f6fefa8aa', 'valid, needs mapping function to convert to GB18030'), ('\x66e8b1ff6f6f', 'invalid byte sequence'), ('\x66006f', 'invalid, NUL byte'), - ('\x666f6fe8b100', 'invalid, NUL byte'), + ('\x666f6fe8b100', 'invalid, NUL byte at end'), ('\x666f6fe8b1', 'incomplete character at end'); -- Test UTF-8 verification select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs; @@ -102,10 +154,48 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs; valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | | invalid byte sequence | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) +-- Test UTF-8 verification with ASCII padding appended to provide +-- coverage for algorithms that work on 16-bytes at a time. +-- XXX: The descriptions must be unique across the utf8_inputs and +-- utf8_verification_inputs tables. +with test_bytes as ( + -- The error message for a sequence starting with a 4-byte lead + -- will contain all 4 bytes if they are present, so add 3 + -- ASCII bytes to the end to ensure consistent error messages. + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs + union all + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_inputs +), test_padded as ( + select + description, + (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + description | orig_error | error_after_padding +-------------+------------+--------------------- +(0 rows) + -- Test conversions from UTF-8 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; description | inbytes | result | errorat | error @@ -121,7 +211,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "EUC_JIS_2004" invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) @@ -139,7 +229,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_ valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN1" invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) @@ -157,7 +247,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin2')).* from utf8_ valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN2" invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) @@ -175,7 +265,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin5')).* from utf8_ valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN5" invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) @@ -193,7 +283,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'koi8r')).* from utf8_i valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "KOI8R" invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) @@ -211,7 +301,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'gb18030')).* from utf8 valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f84309c38 | | invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 - invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + invalid, NUL byte at end | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 (13 rows) diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 8358682432..8ad5290f4c 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -74,6 +74,34 @@ $$; -- -- UTF-8 -- +CREATE TABLE utf8_verification_inputs (inbytes bytea, description text); +insert into utf8_verification_inputs values + ('\xaf', 'bare continuation'), + ('\xc5', 'missing second byte in 2-byte char'), + ('\xc080', 'smallest 2-byte overlong'), + ('\xc1bf', 'largest 2-byte overlong'), + ('\xc280', 'next 2-byte after overlongs'), + ('\xdfbf', 'largest 2-byte'), + ('\xe9af', 'missing third byte in 3-byte char'), + ('\xe08080', 'smallest 3-byte overlong'), + ('\xe09fbf', 'largest 3-byte overlong'), + ('\xe0a080', 'next 3-byte after overlong'), + ('\xed9fbf', 'last before surrogates'), + ('\xeda080', 'smallest surrogate'), + ('\xedbfbf', 'largest surrogate'), + ('\xee8080', 'next after surrogates'), + ('\xefbfbf', 'largest 3-byte'), + ('\xf1afbf', 'missing fourth byte in 4-byte char'), + ('\xf0808080', 'smallest 4-byte overlong'), + ('\xf08fbfbf', 'largest 4-byte overlong'), + ('\xf0908080', 'next 4-byte after overlong'), + ('\xf48fbfbf', 'largest 4-byte'), + ('\xf4908080', 'smallest too large'), + ('\xfa9a9a8a8a', '5 byte'); + +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs; + CREATE TABLE utf8_inputs (inbytes bytea, description text); insert into utf8_inputs values ('\x666f6f', 'valid, pure ASCII'), @@ -87,11 +115,47 @@ insert into utf8_inputs values ('\x666f6fefa8aa', 'valid, needs mapping function to convert to GB18030'), ('\x66e8b1ff6f6f', 'invalid byte sequence'), ('\x66006f', 'invalid, NUL byte'), - ('\x666f6fe8b100', 'invalid, NUL byte'), + ('\x666f6fe8b100', 'invalid, NUL byte at end'), ('\x666f6fe8b1', 'incomplete character at end'); -- Test UTF-8 verification select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs; + +-- Test UTF-8 verification with ASCII padding appended to provide +-- coverage for algorithms that work on 16-bytes at a time. +-- XXX: The descriptions must be unique across the utf8_inputs and +-- utf8_verification_inputs tables. +with test_bytes as ( + -- The error message for a sequence starting with a 4-byte lead + -- will contain all 4 bytes if they are present, so add 3 + -- ASCII bytes to the end to ensure consistent error messages. + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_verification_inputs + union all + select + inbytes, + description, + (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error + from utf8_inputs +), test_padded as ( + select + description, + (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error + from test_bytes +) +select + description, + b.error as orig_error, + p.error as error_after_padding +from test_padded p +join test_bytes b +using (description) +where p.error is distinct from b.error +order by description; + -- Test conversions from UTF-8 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_inputs; -- 2.31.1