diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index df29af6371..6ad8136523 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -3553,6 +3553,34 @@ repeat('Pg', 4) PgPgPgPg + + + + unistr + + unistr ( string text ) + text + + + Evaluate escaped Unicode characters in the argument. Each character + is written as \XXXX (4 hexadecimal digits), \+XXXXXX (6 hexadecimal + digits), \uXXXX (4 hexadecimal digits), or \UXXXXXXXX (8 hexadecimal + digits); \\ produces a single backslash. + + + unistr('\0441\043B\043E\043D') + слон + + + unistr('d\0061t\+000061') + data + + + unistr('d\u0061t\U00000061') + data + + + diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c index be86eb37fe..cbddb61396 100644 --- a/src/backend/parser/parser.c +++ b/src/backend/parser/parser.c @@ -278,30 +278,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner) return cur_token; } -/* convert hex digit (caller should have verified that) to value */ -static unsigned int -hexval(unsigned char c) -{ - if (c >= '0' && c <= '9') - return c - '0'; - if (c >= 'a' && c <= 'f') - return c - 'a' + 0xA; - if (c >= 'A' && c <= 'F') - return c - 'A' + 0xA; - elog(ERROR, "invalid hexadecimal digit"); - return 0; /* not reached */ -} - -/* is Unicode code point acceptable? */ -static void -check_unicode_value(pg_wchar c) -{ - if (!is_valid_unicode_codepoint(c)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("invalid Unicode escape value"))); -} - /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? 
*/
 static bool
 check_uescapechar(unsigned char escape)
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index d07cbafcee..b39dde12bd 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -125,3 +125,27 @@ scanner_isspace(char ch)
 		return true;
 	return false;
 }
+
+/* convert hex digit (caller should have verified that) to value */
+unsigned int
+hexval(unsigned char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 0xA;
+	elog(ERROR, "invalid hexadecimal digit");
+	return 0;					/* not reached */
+}
+
+/* is Unicode code point acceptable? */
+void
+check_unicode_value(pg_wchar c)
+{
+	if (!is_valid_unicode_codepoint(c))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode escape value")));
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ff9bf238f3..e16f0875d6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6290,3 +6290,183 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 
 	PG_RETURN_BOOL(result);
 }
+
+/*
+ * Report whether the four bytes starting at instr are all hexadecimal digits.
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	return isxdigit((unsigned char) instr[0]) &&
+		isxdigit((unsigned char) instr[1]) &&
+		isxdigit((unsigned char) instr[2]) &&
+		isxdigit((unsigned char) instr[3]);
+}
+
+/*
+ * Convert the four hexadecimal digits starting at instr to a number.
+ *
+ * The result type is unsigned (pg_wchar) so that callers may shift it left
+ * by up to 16 bits without overflowing a signed 32-bit long.
+ */
+static pg_wchar
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+		(hexval(instr[1]) << 8) +
+		(hexval(instr[2]) << 4) +
+		hexval(instr[3]);
+}
+
+/*
+ * Handle one decoded code point on behalf of unistr().
+ *
+ * *pair_first is the pending first surrogate, or 0 when there is none.  A
+ * first surrogate is stored there, a second surrogate is merged with the
+ * pending first one, and every resulting non-surrogate code point is passed
+ * to pg_unicode_to_server() and appended to buf.
+ *
+ * Returns false when the code point violates surrogate pairing, else true.
+ * check_unicode_value() raises an error for unacceptable code points.
+ */
+static bool
+unistr_append_codepoint(StringInfo buf, pg_wchar unicode, pg_wchar *pair_first)
+{
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+
+	check_unicode_value(unicode);
+
+	if (*pair_first)
+	{
+		/* a pending first surrogate must be completed by a second one */
+		if (!is_utf16_surrogate_second(unicode))
+			return false;
+
+		unicode = surrogate_pair_to_codepoint(*pair_first, unicode);
+		*pair_first = 0;
+	}
+	else if (is_utf16_surrogate_second(unicode))
+		return false;
+
+	if (is_utf16_surrogate_first(unicode))
+		*pair_first = unicode;
+	else
+	{
+		pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+		appendStringInfoString(buf, cbuf);
+	}
+
+	return true;
+}
+
+/*
+ * unistr(text) returns text
+ *
+ * Replace Unicode escape sequences in the argument by the characters they
+ * denote.  The accepted forms are \XXXX, \uXXXX, \+XXXXXX and \UXXXXXXXX,
+ * where X is a hexadecimal digit; \\ produces a single backslash.  Any other
+ * backslash sequence raises "invalid Unicode escape".  A first surrogate
+ * must be followed immediately by an escaped second surrogate, otherwise
+ * "invalid Unicode surrogate pair" is raised.
+ *
+ * The function is marked strict in pg_proc.dat, so no NULL check is needed.
+ */
+Datum
+unistr(PG_FUNCTION_ARGS)
+{
+	text	   *input_text = PG_GETARG_TEXT_PP(0);
+	char	   *instr = VARDATA_ANY(input_text);
+	int			len = VARSIZE_ANY_EXHDR(input_text);
+	pg_wchar	pair_first = 0;
+	StringInfoData str;
+	text	   *result;
+
+	initStringInfo(&str);
+
+	while (len > 0)
+	{
+		pg_wchar	unicode = 0;
+		int			consumed = 0;
+
+		if (instr[0] != '\\')
+		{
+			/* plain byte: must not interrupt a surrogate pair */
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(&str, *instr++);
+			len--;
+			continue;
+		}
+
+		if (len >= 2 && instr[1] == '\\')
+		{
+			/* escaped backslash: must not interrupt a surrogate pair */
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(&str, '\\');
+			instr += 2;
+			len -= 2;
+			continue;
+		}
+
+		if (len >= 5 && isxdigit_four(&instr[1]))
+		{
+			/* \XXXX */
+			unicode = hexval_four(&instr[1]);
+			consumed = 5;
+		}
+		else if (len >= 6 && instr[1] == 'u' && isxdigit_four(&instr[2]))
+		{
+			/* \uXXXX */
+			unicode = hexval_four(&instr[2]);
+			consumed = 6;
+		}
+		else if (len >= 8 && instr[1] == '+' &&
+				 isxdigit_four(&instr[2]) &&
+				 isxdigit((unsigned char) instr[6]) &&
+				 isxdigit((unsigned char) instr[7]))
+		{
+			/* \+XXXXXX */
+			unicode = (hexval_four(&instr[2]) << 8) +
+				(hexval(instr[6]) << 4) +
+				hexval(instr[7]);
+			consumed = 8;
+		}
+		else if (len >= 10 && instr[1] == 'U' &&
+				 isxdigit_four(&instr[2]) &&
+				 isxdigit_four(&instr[6]))
+		{
+			/* \UXXXXXXXX */
+			unicode = (hexval_four(&instr[2]) << 16) + hexval_four(&instr[6]);
+			consumed = 10;
+		}
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("invalid Unicode escape"),
+					 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+
+		if (!unistr_append_codepoint(&str, unicode, &pair_first))
+			goto invalid_pair;
+
+		instr += consumed;
+		len -= consumed;
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+	PG_RETURN_NULL();			/* keep compiler quiet */
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index fc2202b843..92149b9cc2 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11010,4 +11010,7 @@
   proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
   prosrc => 'unicode_is_normalized' },
+{ oid => '9822', descr => 'unescape Unicode chars in strings',
+  proname => 'unistr', prorettype => 'text', proargtypes => 'text',
+  proisstrict => 't', prosrc => 'unistr' },
 
 ]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 5bc426660d..1481c1da01 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -14,6 +14,8 @@
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
+#include "mb/pg_wchar.h"
+
 extern char *downcase_truncate_identifier(const char *ident,
int len, bool warn);
 
@@ -24,4 +26,8 @@ extern void truncate_identifier(char *ident, int len, bool warn);
 
 extern bool scanner_isspace(char ch);
 
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..778ef6e696 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,39 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+SELECT unistr('\0441\043B\043E\043D');
+ unistr 
+--------
+ слон
+(1 row)
+
+SELECT unistr('d\0061t\+000061');
+ unistr 
+--------
+ data
+(1 row)
+
+SELECT unistr('d\u0061t\U00000061');
+ unistr 
+--------
+ data
+(1 row)
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \db99\0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+00db99\+000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \+2FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unistr('wrong: \udb99\u0061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U0000db99\U00000061');
+ERROR:  invalid Unicode surrogate pair
+SELECT unistr('wrong: \U002FFFFF');
+ERROR:  invalid Unicode escape value
+SELECT unistr('wrong: \xyz');
+ERROR:  invalid Unicode escape
+HINT:  Unicode escapes must be \XXXX, \+XXXXXX, \uXXXX or \UXXXXXXXX.
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..546e85f8cd 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,17 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+SELECT unistr('\0441\043B\043E\043D');
+SELECT unistr('d\0061t\+000061');
+SELECT unistr('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+SELECT unistr('wrong: \db99\0061');
+SELECT unistr('wrong: \+00db99\+000061');
+SELECT unistr('wrong: \+2FFFFF');
+SELECT unistr('wrong: \udb99\u0061');
+SELECT unistr('wrong: \U0000db99\U00000061');
+SELECT unistr('wrong: \U002FFFFF');
+SELECT unistr('wrong: \xyz');