diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index df29af6371..6ad8136523 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3553,6 +3553,34 @@ repeat('Pg', 4) PgPgPgPg
+
+
+
+ unistr
+
+ unistr ( string text )
+ text
+
+
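+ Evaluates escaped Unicode characters in the argument. A backslash may be
+ followed by 4 hexadecimal digits, by u and 4 hexadecimal digits,
+ by + and 6 hexadecimal digits, or by U and 8 hexadecimal digits.
+ Two backslashes yield one backslash; other characters are copied unchanged.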
+
+
+ unistr('\0441\043B\043E\043D')
+ слон
+
+
+ unistr('d\0061t\+000061')
+ data
+
+
+ unistr('d\u0061t\U00000061')
+ data
+
+
+
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index be86eb37fe..cbddb61396 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -278,30 +278,6 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
return cur_token;
}
-/* convert hex digit (caller should have verified that) to value */
-static unsigned int
-hexval(unsigned char c)
-{
- if (c >= '0' && c <= '9')
- return c - '0';
- if (c >= 'a' && c <= 'f')
- return c - 'a' + 0xA;
- if (c >= 'A' && c <= 'F')
- return c - 'A' + 0xA;
- elog(ERROR, "invalid hexadecimal digit");
- return 0; /* not reached */
-}
-
-/* is Unicode code point acceptable? */
-static void
-check_unicode_value(pg_wchar c)
-{
- if (!is_valid_unicode_codepoint(c))
- ereport(ERROR,
- (errcode(ERRCODE_SYNTAX_ERROR),
- errmsg("invalid Unicode escape value")));
-}
-
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
static bool
check_uescapechar(unsigned char escape)
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index d07cbafcee..b39dde12bd 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -125,3 +125,27 @@ scanner_isspace(char ch)
return true;
return false;
}
+
+/*
+ * Convert a single hexadecimal digit to its numeric value.
+ *
+ * Accepts '0'-'9', 'a'-'f' and 'A'-'F' and returns 0 through 15.  The caller
+ * should already have verified that c is a hexadecimal digit; any other
+ * character raises an error via elog().
+ */
+unsigned int
+hexval(unsigned char c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ if (c >= 'a' && c <= 'f')
+ return c - 'a' + 0xA;
+ if (c >= 'A' && c <= 'F')
+ return c - 'A' + 0xA;
+ elog(ERROR, "invalid hexadecimal digit");
+ return 0; /* not reached */
+}
+
+/*
+ * Verify that c is an acceptable Unicode code point.
+ *
+ * Returns normally when is_valid_unicode_codepoint() accepts c; otherwise
+ * raises a syntax error with the message "invalid Unicode escape value".
+ */
+void
+check_unicode_value(pg_wchar c)
+{
+ if (!is_valid_unicode_codepoint(c))
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("invalid Unicode escape value")));
+}
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index ff9bf238f3..e16f0875d6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -6290,3 +6290,202 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(result);
}
+
+/*
+ * Report whether the four characters starting at instr are all hexadecimal
+ * digits.  Scanning stops at the first character that is not one, so later
+ * characters are not examined in that case.
+ */
+static bool
+isxdigit_four(const char *instr)
+{
+	int			i;
+
+	for (i = 0; i < 4; i++)
+	{
+		if (!isxdigit((unsigned char) instr[i]))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Convert the four hexadecimal digits starting at instr to their numeric
+ * value (0 .. 0xFFFF).
+ *
+ * The caller must already have verified the digits, e.g. with
+ * isxdigit_four(); hexval() raises an error for any other character.
+ *
+ * The result is unsigned, like hexval()'s, so that callers can shift it
+ * further to the left (by 16 for the 8-digit escape form) without
+ * overflowing a signed type on platforms where long is only 32 bits wide.
+ */
+static unsigned int
+hexval_four(const char *instr)
+{
+	return (hexval(instr[0]) << 12) +
+		(hexval(instr[1]) << 8) +
+		(hexval(instr[2]) << 4) +
+		hexval(instr[3]);
+}
+
+/*
+ * Append one code point decoded from an escape sequence to the result of
+ * unistr(), taking care of UTF-16 surrogate pairs.
+ *
+ * "unicode" is the value of the escape just read.  "*pair_first" holds the
+ * first half of a surrogate pair seen in the preceding escape, or 0 if none
+ * is pending.  A first surrogate is only remembered in "*pair_first"; a
+ * second surrogate is combined with the pending first one into a single
+ * code point.  Complete code points are converted with
+ * pg_unicode_to_server() and appended to "str".
+ *
+ * Raises an error if check_unicode_value() rejects the value or if the
+ * surrogate halves do not pair up.
+ */
+static void
+unistr_append_codepoint(StringInfo str, pg_wchar unicode, pg_wchar *pair_first)
+{
+	char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
+	bool		is_second;
+
+	check_unicode_value(unicode);
+
+	/* a second surrogate is valid exactly when a first one is pending */
+	is_second = is_utf16_surrogate_second(unicode);
+	if ((*pair_first != 0) != is_second)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("invalid Unicode surrogate pair")));
+
+	if (is_second)
+	{
+		unicode = surrogate_pair_to_codepoint(*pair_first, unicode);
+		*pair_first = 0;
+	}
+
+	if (is_utf16_surrogate_first(unicode))
+		*pair_first = unicode;	/* wait for the second half */
+	else
+	{
+		pg_unicode_to_server(unicode, (unsigned char *) cbuf);
+		appendStringInfoString(str, cbuf);
+	}
+}
+
+/*
+ * unistr(text) returns text
+ *
+ * Replace Unicode escape sequences in the argument by the characters they
+ * denote.  The recognized sequences are
+ *
+ *		\XXXX			4 hexadecimal digits
+ *		\uXXXX			4 hexadecimal digits
+ *		\+XXXXXX		6 hexadecimal digits
+ *		\UXXXXXXXX		8 hexadecimal digits
+ *		\\				a literal backslash
+ *
+ * All other characters are copied unchanged.  A UTF-16 surrogate pair may
+ * be written as two consecutive escapes.  Any other use of a surrogate
+ * value, a value rejected by check_unicode_value(), or a backslash that
+ * does not start one of the forms above raises an error.
+ *
+ * The function is declared strict in pg_proc.dat, so it is never called
+ * with a NULL argument and needs no NULL check of its own.
+ */
+Datum
+unistr(PG_FUNCTION_ARGS)
+{
+	text	   *input_text = PG_GETARG_TEXT_PP(0);
+	char	   *instr = VARDATA_ANY(input_text);
+	int			len = VARSIZE_ANY_EXHDR(input_text);
+	StringInfoData str;
+	text	   *result;
+	pg_wchar	pair_first = 0;
+
+	initStringInfo(&str);
+
+	while (len > 0)
+	{
+		if (instr[0] == '\\')
+		{
+			if (len >= 2 &&
+				instr[1] == '\\')
+			{
+				/* an escaped backslash cannot complete a surrogate pair */
+				if (pair_first)
+					goto invalid_pair;
+				appendStringInfoChar(&str, '\\');
+				instr += 2;
+				len -= 2;
+			}
+			else if (len >= 5 &&
+					 isxdigit_four(&instr[1]))
+			{
+				/* \XXXX */
+				unistr_append_codepoint(&str,
+										hexval_four(&instr[1]),
+										&pair_first);
+				instr += 5;
+				len -= 5;
+			}
+			else if (len >= 6 &&
+					 instr[1] == 'u' &&
+					 isxdigit_four(&instr[2]))
+			{
+				/* \uXXXX */
+				unistr_append_codepoint(&str,
+										hexval_four(&instr[2]),
+										&pair_first);
+				instr += 6;
+				len -= 6;
+			}
+			else if (len >= 8 &&
+					 instr[1] == '+' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit((unsigned char) instr[6]) &&
+					 isxdigit((unsigned char) instr[7]))
+			{
+				/* \+XXXXXX */
+				pg_wchar	unicode;
+
+				/* shift in an unsigned type; see the 8-digit case below */
+				unicode = ((pg_wchar) hexval_four(&instr[2]) << 8) +
+					(hexval(instr[6]) << 4) +
+					hexval(instr[7]);
+
+				unistr_append_codepoint(&str, unicode, &pair_first);
+				instr += 8;
+				len -= 8;
+			}
+			else if (len >= 10 &&
+					 instr[1] == 'U' &&
+					 isxdigit_four(&instr[2]) &&
+					 isxdigit_four(&instr[6]))
+			{
+				/* \UXXXXXXXX */
+				pg_wchar	unicode;
+
+				/*
+				 * Cast before shifting: the upper half can be as large as
+				 * 0xFFFF, and shifting that left by 16 bits in a signed
+				 * 32-bit type would overflow.
+				 */
+				unicode = ((pg_wchar) hexval_four(&instr[2]) << 16) +
+					(pg_wchar) hexval_four(&instr[6]);
+
+				unistr_append_codepoint(&str, unicode, &pair_first);
+				instr += 10;
+				len -= 10;
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("invalid Unicode escape"),
+						 errhint("Unicode escapes must be \\XXXX, \\+XXXXXX, \\uXXXX or \\UXXXXXXXX.")));
+		}
+		else
+		{
+			/* an ordinary character cannot complete a surrogate pair */
+			if (pair_first)
+				goto invalid_pair;
+
+			appendStringInfoChar(&str, *instr++);
+			len--;
+		}
+	}
+
+	/* unfinished surrogate pair? */
+	if (pair_first)
+		goto invalid_pair;
+
+	result = cstring_to_text_with_len(str.data, str.len);
+	pfree(str.data);
+
+	PG_RETURN_TEXT_P(result);
+
+invalid_pair:
+	ereport(ERROR,
+			(errcode(ERRCODE_SYNTAX_ERROR),
+			 errmsg("invalid Unicode surrogate pair")));
+	PG_RETURN_NULL();			/* keep compiler quiet */
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index fc2202b843..92149b9cc2 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -11010,4 +11010,7 @@
proname => 'is_normalized', prorettype => 'bool', proargtypes => 'text text',
prosrc => 'unicode_is_normalized' },
+{ oid => '9822', descr => 'unescape Unicode chars in strings',
+ proname => 'unistr', prorettype => 'text', proargtypes => 'text',
+ proisstrict => 't', prosrc => 'unistr' },
]
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 5bc426660d..1481c1da01 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -14,6 +14,8 @@
#ifndef SCANSUP_H
#define SCANSUP_H
+#include "mb/pg_wchar.h"
+
extern char *downcase_truncate_identifier(const char *ident, int len,
bool warn);
@@ -24,4 +26,8 @@ extern void truncate_identifier(char *ident, int len, bool warn);
extern bool scanner_isspace(char ch);
+extern unsigned int hexval(unsigned char c);
+
+extern void check_unicode_value(pg_wchar c);
+
#endif /* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 2a1e903696..778ef6e696 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -79,3 +79,30 @@ ORDER BY num;
SELECT is_normalized('abc', 'def'); -- run-time error
ERROR: invalid normalization form: def
+SELECT unistr('\0441\043B\043E\043D');
+ unistr
+--------
+ слон
+(1 row)
+
+SELECT unistr('d\u0061t\U00000061');
+ unistr
+--------
+ data
+(1 row)
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+ERROR: invalid Unicode surrogate pair
+SELECT unistr('wrong: \db99\0061');
+ERROR: invalid Unicode surrogate pair
+SELECT unistr('wrong: \+00db99\+000061');
+ERROR: invalid Unicode surrogate pair
+SELECT unistr('wrong: \+2FFFFF');
+ERROR: invalid Unicode escape value
+SELECT unistr('wrong: \udb99\u0061');
+ERROR: invalid Unicode surrogate pair
+SELECT unistr('wrong: \U0000db99\U00000061');
+ERROR: invalid Unicode surrogate pair
+SELECT unistr('wrong: \U002FFFFF');
+ERROR: invalid Unicode escape value
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index ccfc6fa77a..546e85f8cd 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -30,3 +30,15 @@ FROM
ORDER BY num;
SELECT is_normalized('abc', 'def'); -- run-time error
+
+SELECT unistr('\0441\043B\043E\043D');
+SELECT unistr('d\u0061t\U00000061');
+
+-- run-time error
+SELECT unistr('wrong: \db99');
+SELECT unistr('wrong: \db99\0061');
+SELECT unistr('wrong: \+00db99\+000061');
+SELECT unistr('wrong: \+2FFFFF');
+SELECT unistr('wrong: \udb99\u0061');
+SELECT unistr('wrong: \U0000db99\U00000061');
+SELECT unistr('wrong: \U002FFFFF');