diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c index 1615c48233..af1d636331 100644 --- a/src/port/pg_utf8_fallback.c +++ b/src/port/pg_utf8_fallback.c @@ -17,8 +17,91 @@ #include "port/pg_utf8.h" +static bool +pg_utf8_islegal(const unsigned char *source, int length) /* Returns true iff the length-byte sequence at source passes the byte-range checks below; only lengths 1..4 can pass. */ +{ + unsigned char a; + + switch (length) + { + default: + /* reject lengths 5 and 6 for now (any length outside 1..4 lands here) */ + return false; + case 4: /* 4th byte must be a continuation byte, 0x80..0xBF */ + a = source[3]; + if (a < 0x80 || a > 0xBF) + return false; + /* FALL THRU */ + case 3: /* 3rd byte must be a continuation byte, 0x80..0xBF */ + a = source[2]; + if (a < 0x80 || a > 0xBF) + return false; + /* FALL THRU */ + case 2: /* allowed range of the 2nd byte depends on the lead byte */ + a = source[1]; + switch (*source) + { + case 0xE0: /* a 2nd byte below 0xA0 would be a 3-byte overlong form */ + if (a < 0xA0 || a > 0xBF) + return false; + break; + case 0xED: /* a 2nd byte above 0x9F would encode a surrogate */ + if (a < 0x80 || a > 0x9F) + return false; + break; + case 0xF0: /* a 2nd byte below 0x90 would be a 4-byte overlong form */ + if (a < 0x90 || a > 0xBF) + return false; + break; + case 0xF4: /* a 2nd byte above 0x8F would make the value too large */ + if (a < 0x80 || a > 0x8F) + return false; + break; + default: /* any other lead byte: plain continuation-byte range */ + if (a < 0x80 || a > 0xBF) + return false; + break; + } + /* FALL THRU */ + case 1: /* finally check the lead byte itself */ + a = *source; + if (a >= 0x80 && a < 0xC2) /* continuation byte in lead position, or 0xC0/0xC1 (2-byte overlong) */ + return false; + if (a > 0xF4) /* lead byte too large */ + return false; + break; + } + return true; +} + +static int +pg_utf8_verifychar(const unsigned char *s, int len) /* Returns the byte length (1..4) of the sequence starting at s, or -1 if it starts with a NUL byte, needs more than len bytes, or is rejected by pg_utf8_islegal. Reads *s before looking at len, so len must be at least 1. */ +{ + int l; -#define IS_CONTINUATION_BYTE(c) (((c) & 0b11000000) == 0b10000000) + if ((*s & 0x80) == 0) /* high bit clear: single-byte character */ + { + if (*s == '\0') + return -1; + return 1; + } + else if ((*s & 0xe0) == 0xc0) + l = 2; + else if ((*s & 0xf0) == 0xe0) + l = 3; + else if ((*s & 0xf8) == 0xf0) + l = 4; + else /* byte matches no lead-byte pattern */ + l = 1; /* checked as length 1, which pg_utf8_islegal rejects for these bytes */ + + if (l > len) /* sequence would run past the available input */ + return -1; + + if (!pg_utf8_islegal(s, l)) + return -1; + + return l; +} /* * See the comment in common/wchar.c under "multibyte sequence validators". 
@@ -27,7 +110,6 @@ int pg_validate_utf8_fallback(const unsigned char *s, int len) { const unsigned char *start = s; - unsigned char b1, b2, b3, b4; while (len > 0) { @@ -49,81 +131,12 @@ pg_validate_utf8_fallback(const unsigned char *s, int len) break; l = 1; } - /* code points U+0080 through U+07FF */ - else if ((*s & 0b11100000) == 0b11000000) - { - l = 2; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - - if (!IS_CONTINUATION_BYTE(b2)) - break; - - /* check 2-byte overlong: 1100.000x.10xx.xxxx */ - if (b1 < 0xC2) - break; - } - /* code points U+0800 through U+D7FF and U+E000 through U+FFFF */ - else if ((*s & 0b11110000) == 0b11100000) - { - l = 3; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3)) - break; - - /* check 3-byte overlong: 1110.0000 1001.xxxx 10xx.xxxx */ - if (b1 == 0xE0 && b2 < 0xA0) - break; - - /* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */ - if (b1 == 0xED && b2 > 0x9F) - break; - } - /* code points U+010000 through U+10FFFF */ - else if ((*s & 0b11111000) == 0b11110000) + else { - l = 4; - if (len < l) - break; - - b1 = *s; - b2 = *(s + 1); - b3 = *(s + 2); - b4 = *(s + 3); - - if (!IS_CONTINUATION_BYTE(b2) || - !IS_CONTINUATION_BYTE(b3) || - !IS_CONTINUATION_BYTE(b4)) - break; - - /* - * check 4-byte overlong: - * 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx - */ - if (b1 == 0xF0 && b2 < 0x90) - break; - - /* - * check too large: - * 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx - */ - if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4) + l = pg_utf8_verifychar(s, len); + if (l == -1) break; } - else - /* invalid byte */ - break; - s += l; len -= l; }