diff --git a/src/common/wchar.c b/src/common/wchar.c
index 0636b8765b..f0a7333985 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,12 @@
 #include "mb/pg_wchar.h"
 
 
+/* for UTF-8 */
+#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80)
+#define IS_TWO_BYTE_LEAD(c) (((c) & 0xE0) == 0xC0)
+#define IS_THREE_BYTE_LEAD(c) (((c) & 0xF0) == 0xE0)
+#define IS_FOUR_BYTE_LEAD(c) (((c) & 0xF8) == 0xF0)
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1757,28 +1763,293 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 	return l;
 }
 
+/* possible transition states for the UTF-8 DFA */
+
+#define DFA_BITS_PER_STATE 6
+#define DFA_MASK ((1 << DFA_BITS_PER_STATE) - 1)
+
+/* TODO: switch BGN and ERR to make the state transition encodings more readable */
+
+/* Start */
+#define BGN UINT64CONST(0)
+/* Invalid sequence */
+#define ERR (UINT64CONST(1) * DFA_BITS_PER_STATE)
+/* Continuation states */
+#define CS1 (UINT64CONST(2) * DFA_BITS_PER_STATE)
+#define CS2 (UINT64CONST(3) * DFA_BITS_PER_STATE)
+#define CS3 (UINT64CONST(4) * DFA_BITS_PER_STATE)
+/* Partial 3-byte sequence states */
+#define P3A (UINT64CONST(5) * DFA_BITS_PER_STATE)
+#define P3B (UINT64CONST(6) * DFA_BITS_PER_STATE)
+/* Partial 4-byte sequence states */
+#define P4A (UINT64CONST(7) * DFA_BITS_PER_STATE)
+#define P4B (UINT64CONST(8) * DFA_BITS_PER_STATE)
+/* Start and End are the same state */
+#define END BGN
+
+/*
+ * The DFA transition table would look like this if encoded as an array
+ * (ERR is lower case for readability). TODO: source
+ *
+ * ILL  NZA  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C  CLASS / STATE
+ * =========================================================================
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, // BGN|END
+ * err, err, err, err, err, err, err, err, err, err, err, err, // ERR
+ *
+ * err, err, END, END, END, err, err, err, err, err, err, err, // CS1
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, // CS2
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, // CS3
+ *
+ * err, err, err, err, CS1, err, err, err, err, err, err, err, // P3A
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err, // P3B
+ *
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err, // P4A
+ * err, err, CS2, err, err, err, err, err, err, err, err, err, // P4B
+ */
+
+/*
+ * Encode each transition within DFA_BITS_PER_STATE-sized sequences of bits.
+ * A state's value is the bit offset of its slot, so in a byte class
+ * "(next << current)" stores the next state in the current state's slot,
+ * and shifting the class right by the current state moves that next state
+ * into the low DFA_BITS_PER_STATE bits.
+ *
+ * Based on idea from Per Vognsen:
+ * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
+ */
+
+#define ERR_ON_ALL_NON_BGN_STATES ((ERR << ERR) | (ERR << CS1) | (ERR << CS2) | (ERR << CS3) | (ERR << P3A) | (ERR << P3B) | (ERR << P4A) | (ERR << P4B))
+
+/* Invalid bytes */
+#define ILL (ERR | ERR_ON_ALL_NON_BGN_STATES)
+
+/* Non-zero ASCII */
+#define NZA (END | ERR_ON_ALL_NON_BGN_STATES)
+
+/* continuation bytes */
+#define CR1 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (ERR << P4A) | (CS2 << P4B))
+#define CR2 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (ERR << P3A) | (CS1 << P3B) | (CS2 << P4A) | (ERR << P4B))
+#define CR3 (ERR | (ERR << ERR) | (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (ERR << P3B) | (CS2 << P4A) | (ERR << P4B))
+
+/* 2-byte lead */
+#define L2A (CS1 | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 3-byte leads */
+#define L3A (P3A | ERR_ON_ALL_NON_BGN_STATES)
+#define L3B (CS2 | ERR_ON_ALL_NON_BGN_STATES)
+#define L3C (P3B | ERR_ON_ALL_NON_BGN_STATES)
+
+/* 4-byte leads */
+#define L4A (P4A | ERR_ON_ALL_NON_BGN_STATES)
+#define L4B (CS3 | ERR_ON_ALL_NON_BGN_STATES)
+#define L4C (P4B | ERR_ON_ALL_NON_BGN_STATES)
+
+/* maps an input byte to an integer that encodes the state transitions */
+#define REP16(a) a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a
+static const uint64 ByteCategory[256] =
+{
+	/* ASCII */
+
+	ILL, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	NZA, NZA, NZA, NZA, NZA, NZA, NZA, NZA,
+	REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+	REP16(NZA), REP16(NZA),
+
+	/* continuation bytes */
+
+	/* 80..8F */
+	REP16(CR1),
+
+	/* 90..9F */
+	REP16(CR2),
+
+	/* A0..BF */
+	REP16(CR3), REP16(CR3),
+
+	/* leading bytes */
+
+	/* C0..CF */
+	ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* D0..DF */
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+	L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
+
+	/* E0..EF */
+	L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
+	L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
+
+	/* F0..FF */
+	L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
+	ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL,
+};
+
+/*
+ * Verify a chunk of bytes for valid ASCII including a zero-byte check.
+ *
+ * Returns sizeof(uint64) if "len" covers at least that many bytes and the
+ * first sizeof(uint64) bytes at "s" are all non-zero ASCII; otherwise
+ * returns 0.
+ */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		chunk,
+				highbits_set,
+				highbit_carry;
+
+	/* TODO: consider replacing this with an assert. */
+	if (len >= (int) sizeof(uint64))
+	{
+		memcpy(&chunk, s, sizeof(uint64));
+
+		/* Check if any bytes in this chunk have the high bit set. */
+		highbits_set = chunk & UINT64CONST(0x8080808080808080);
+		if (highbits_set)
+			return 0;
+
+		/*
+		 * Check if there are any zero bytes in this chunk.
+		 *
+		 * First, add 0x7f to each byte. This sets the high bit in each byte,
+		 * unless it was a zero. We already checked that none of the bytes had
+		 * the high bit set previously, so the max value each byte can have
+		 * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to
+		 * worry about carrying over to the next byte.
+		 */
+		highbit_carry = chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+		/* Then check that the high bit is set in each byte. */
+		highbit_carry &= UINT64CONST(0x8080808080808080);
+		if (highbit_carry == UINT64CONST(0x8080808080808080))
+			return sizeof(uint64);
+		else
+			return 0;
+	}
+	else
+		return 0;
+}
+
+/*
+ * Feed "len" bytes starting at "s" through the DFA, beginning in "state",
+ * and return the state reached, masked to DFA_BITS_PER_STATE bits.
+ */
+static inline uint64
+utf8_advance(const unsigned char *s, uint64 state, int len)
+{
+	while (len > 0)
+	{
+		state = ByteCategory[*s++] >> (state & DFA_MASK);
+		len--;
+	}
+
+	return state & DFA_MASK;
+}
+
+/*
+ * Verify the leading part of "s" in STRIDE_LENGTH-byte chunks, stopping once
+ * STRIDE_LENGTH or fewer bytes remain.
+ *
+ * Returns 0 if the DFA reached ERR in any chunk, otherwise the number of
+ * bytes consumed.  The consumed bytes may end in an incomplete multibyte
+ * sequence, so the caller must back up and recheck the tail.
+ */
 static int
-pg_utf8_verifystr(const unsigned char *s, int len)
+utf8_verifystr_fast(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	uint64		state = BGN;
 
-	while (len > 0)
+#define STRIDE_LENGTH 8
+
+	/*
+	 * fast path when we have enough bytes left in the string to use bitwise
+	 * operations
+	 */
+	while (len > STRIDE_LENGTH)
 	{
 		int			l;
 
 		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
+		l = check_ascii(s, STRIDE_LENGTH);
+
+		/*
+		 * If the chunk is all ASCII, we can skip the full UTF-8 check, but we
+		 * must still check the previous chunk for incomplete multibyte
+		 * sequences at the end.
+		 *
+		 * WIP: if check_ascii returned END / ERR, this could be shortened to
+		 * if (l != state).
+		 */
+		if (!(l == STRIDE_LENGTH && state == END))
 		{
-			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
+			state = utf8_advance(s, state, STRIDE_LENGTH);
 		}
+
+		s += STRIDE_LENGTH;
+		len -= STRIDE_LENGTH;
+	}
+
+	/*
+	 * If we saw an error any time during the loop, let the caller handle it.
+	 */
+	if (state == ERR)
+		return 0;
+
+	/*
+	 * Even if we didn't reach the END state, the caller knows to search for
+	 * the last possible valid character.
+	 */
+	return s - start;
+}
+
+/*
+ * Verify a UTF-8 string.  Inputs longer than STRIDE_LENGTH bytes go through
+ * utf8_verifystr_fast() first; whatever that leaves is checked one character
+ * at a time with pg_utf8_verifychar().
+ */
+static int
+pg_utf8_verifystr(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	int			valid_bytes = 0;
+
+	/* For longer strings, verify multiple bytes at a time. */
+	if (len > STRIDE_LENGTH)
+	{
+		valid_bytes = utf8_verifystr_fast(s, len);
+		s += valid_bytes;
+		len -= valid_bytes;
+	}
+
+	/*
+	 * For short strings, verify one character at a time. For the last few
+	 * bytes of a longer sequence, we first walk backwards to find the last
+	 * byte that could have been the start of a valid character.
+	 */
+	while (s > start)
+	{
+		s--;
+		len++;
+
+		if (((signed char) (*s) > 0) ||
+			IS_TWO_BYTE_LEAD(*s) ||
+			IS_THREE_BYTE_LEAD(*s) ||
+			IS_FOUR_BYTE_LEAD(*s))
+			break;
+	}
+
+	while (len > 0)
+	{
+		int			l;
+
+		l = pg_utf8_verifychar(s, len);
+		if (l == -1)
+			break;
+
 		s += l;
 		len -= l;
 	}
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..62461063b3 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,91 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+  ('\xaf', 'bare continuation'),
+  ('\xc5', 'missing second byte in 2-byte char'),
+  ('\xc080', 'smallest 2-byte overlong'),
+  ('\xc1bf', 'largest 2-byte overlong'),
+  ('\xc280', 'next 2-byte after overlongs'),
+  ('\xdfbf', 'largest 2-byte'),
+  ('\xe9af', 'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080', 'smallest 4-byte overlong'),
+  ('\xf08fbfbf', 'largest 4-byte overlong'),
+  ('\xf0908080', 'next 4-byte after overlong'),
+  ('\xf48fbfbf', 'largest 4-byte'),
+  ('\xf4908080', 'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte'),
+  ('\x66006f', 'NUL byte');
+-- Test UTF-8 verification of each input on its own
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5-byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+ NUL byte                           | \x66       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
+(23 rows)
+
+-- Test UTF-8 verification with ASCII padding appended to provide coverage
+-- for algorithms that work on multiple bytes at a time; expect no rows.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 8)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs values
   ('\x666f6f', 'valid, pure ASCII'),
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 8358682432..5d0280cd1c 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,63 @@ $$;
 --
 -- UTF-8
 --
+-- The description column must be unique.
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text PRIMARY KEY);
+insert into utf8_verification_inputs values
+  ('\xaf', 'bare continuation'),
+  ('\xc5', 'missing second byte in 2-byte char'),
+  ('\xc080', 'smallest 2-byte overlong'),
+  ('\xc1bf', 'largest 2-byte overlong'),
+  ('\xc280', 'next 2-byte after overlongs'),
+  ('\xdfbf', 'largest 2-byte'),
+  ('\xe9af', 'missing third byte in 3-byte char'),
+  ('\xe08080', 'smallest 3-byte overlong'),
+  ('\xe09fbf', 'largest 3-byte overlong'),
+  ('\xe0a080', 'next 3-byte after overlong'),
+  ('\xed9fbf', 'last before surrogates'),
+  ('\xeda080', 'smallest surrogate'),
+  ('\xedbfbf', 'largest surrogate'),
+  ('\xee8080', 'next after surrogates'),
+  ('\xefbfbf', 'largest 3-byte'),
+  ('\xf1afbf', 'missing fourth byte in 4-byte char'),
+  ('\xf0808080', 'smallest 4-byte overlong'),
+  ('\xf08fbfbf', 'largest 4-byte overlong'),
+  ('\xf0908080', 'next 4-byte after overlong'),
+  ('\xf48fbfbf', 'largest 4-byte'),
+  ('\xf4908080', 'smallest too large'),
+  ('\xfa9a9a8a8a', '5-byte'),
+  ('\x66006f', 'NUL byte');
+
+-- Test UTF-8 verification of each input on its own
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide coverage
+-- for algorithms that work on multiple bytes at a time; expect no rows.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 8)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs values
   ('\x666f6f', 'valid, pure ASCII'),