Re: The "char" type versus non-ASCII characters - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: The "char" type versus non-ASCII characters |
Date | |
Msg-id | 1486364.1659306324@sss.pgh.pa.us Whole thread Raw |
In response to | Re: The "char" type versus non-ASCII characters (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: The "char" type versus non-ASCII characters
|
List | pgsql-hackers |
I wrote: > This came up again today [1], so here's a concrete proposal. > Let's use \ooo for high-bit-set chars, but keep backslash as just > backslash (so it's only semi-compatible with bytea). Hearing no howls of protest, here's a fleshed out, potentially-committable version. I added some regression test coverage for the modified code. (I also fixed an astonishingly obsolete comment about what the regular char type does.) I looked at the SGML docs too, but I don't think there is anything to change there. The docs say "single-byte internal type" and are silent about "char" beyond that. I think that's exactly where we want to be: any more detail would encourage people to use the type, which we don't really want. Possibly we could change the text to "single-byte internal type, meant to hold ASCII characters" but I'm not sure that's better. The next question is what to do with this. I propose to commit it into HEAD and v15 before next week's beta3 release. If we don't get a lot of pushback, we could consider back-patching further for the November releases; but I'm hesitant to shove something like this into stable branches with only a week's notice. regards, tom lane diff --git a/src/backend/utils/adt/char.c b/src/backend/utils/adt/char.c index 0df41c2253..e50293bf14 100644 --- a/src/backend/utils/adt/char.c +++ b/src/backend/utils/adt/char.c @@ -20,6 +20,11 @@ #include "libpq/pqformat.h" #include "utils/builtins.h" +#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7')) +#define TOOCTAL(c) ((c) + '0') +#define FROMOCTAL(c) ((unsigned char) (c) - '0') + + /***************************************************************************** * USER I/O ROUTINES * *****************************************************************************/ @@ -27,31 +32,53 @@ /* * charin - converts "x" to 'x' * - * Note that an empty input string will implicitly be converted to \0. + * This accepts the formats charout produces. If we have multibyte input + * that is not in the form '\ooo', then we take its first byte as the value + * and silently discard the rest; this is a backwards-compatibility provision. */ Datum charin(PG_FUNCTION_ARGS) { char *ch = PG_GETARG_CSTRING(0); + if (strlen(ch) == 4 && ch[0] == '\\' && + ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3])) + PG_RETURN_CHAR((FROMOCTAL(ch[1]) << 6) + + (FROMOCTAL(ch[2]) << 3) + + FROMOCTAL(ch[3])); + /* This will do the right thing for a zero-length input string */ PG_RETURN_CHAR(ch[0]); } /* * charout - converts 'x' to "x" * - * Note that if the char value is \0, the resulting string will appear - * to be empty (null-terminated after zero characters). So this is the - * inverse of the charin() function for such data. + * The possible output formats are: + * 1. 0x00 is represented as an empty string. + * 2. 0x01..0x7F are represented as a single ASCII byte. + * 3. 0x80..0xFF are represented as \ooo (backslash and 3 octal digits). + * Case 3 is meant to match the traditional "escape" format of bytea. */ Datum charout(PG_FUNCTION_ARGS) { char ch = PG_GETARG_CHAR(0); - char *result = (char *) palloc(2); + char *result = (char *) palloc(5); - result[0] = ch; - result[1] = '\0'; + if (IS_HIGHBIT_SET(ch)) + { + result[0] = '\\'; + result[1] = TOOCTAL(((unsigned char) ch) >> 6); + result[2] = TOOCTAL((((unsigned char) ch) >> 3) & 07); + result[3] = TOOCTAL(((unsigned char) ch) & 07); + result[4] = '\0'; + } + else + { + /* This produces acceptable results for 0x00 as well */ + result[0] = ch; + result[1] = '\0'; + } PG_RETURN_CSTRING(result); } @@ -176,15 +203,20 @@ Datum text_char(PG_FUNCTION_ARGS) { text *arg1 = PG_GETARG_TEXT_PP(0); + char *ch = VARDATA_ANY(arg1); char result; /* - * An empty input string is converted to \0 (for consistency with charin). - * If the input is longer than one character, the excess data is silently - * discarded. + * Conversion rules are the same as in charin(), but here we need to + * handle the empty-string case honestly. */ - if (VARSIZE_ANY_EXHDR(arg1) > 0) - result = *(VARDATA_ANY(arg1)); + if (VARSIZE_ANY_EXHDR(arg1) == 4 && ch[0] == '\\' && + ISOCTAL(ch[1]) && ISOCTAL(ch[2]) && ISOCTAL(ch[3])) + result = (FROMOCTAL(ch[1]) << 6) + + (FROMOCTAL(ch[2]) << 3) + + FROMOCTAL(ch[3]); + else if (VARSIZE_ANY_EXHDR(arg1) > 0) + result = ch[0]; else result = '\0'; @@ -195,13 +227,21 @@ Datum char_text(PG_FUNCTION_ARGS) { char arg1 = PG_GETARG_CHAR(0); - text *result = palloc(VARHDRSZ + 1); + text *result = palloc(VARHDRSZ + 4); /* - * Convert \0 to an empty string, for consistency with charout (and - * because the text stuff doesn't like embedded nulls all that well). + * Conversion rules are the same as in charout(), but here we need to be + * honest about converting 0x00 to an empty string. */ - if (arg1 != '\0') + if (IS_HIGHBIT_SET(arg1)) + { + SET_VARSIZE(result, VARHDRSZ + 4); + (VARDATA(result))[0] = '\\'; + (VARDATA(result))[1] = TOOCTAL(((unsigned char) arg1) >> 6); + (VARDATA(result))[2] = TOOCTAL((((unsigned char) arg1) >> 3) & 07); + (VARDATA(result))[3] = TOOCTAL(((unsigned char) arg1) & 07); + } + else if (arg1 != '\0') { SET_VARSIZE(result, VARHDRSZ + 1); *(VARDATA(result)) = arg1; diff --git a/src/test/regress/expected/char.out b/src/test/regress/expected/char.out index 2d78f90f3b..ea9b0b8eeb 100644 --- a/src/test/regress/expected/char.out +++ b/src/test/regress/expected/char.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/expected/char_1.out b/src/test/regress/expected/char_1.out index fa6644d692..ffd31551de 100644 --- a/src/test/regress/expected/char_1.out +++ b/src/test/regress/expected/char_1.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/expected/char_2.out b/src/test/regress/expected/char_2.out index 09434a44cd..56818f824b 100644 --- a/src/test/regress/expected/char_2.out +++ b/src/test/regress/expected/char_2.out @@ -1,8 +1,8 @@ -- -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; true ------ @@ -119,3 +119,62 @@ SELECT * FROM CHAR_TBL; abcd (4 rows) +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- +SELECT 'a'::"char"; + char +------ + a +(1 row) + +SELECT '\101'::"char"; + char +------ + A +(1 row) + +SELECT '\377'::"char"; + char +------ + \377 +(1 row) + +SELECT 'a'::"char"::text; + text +------ + a +(1 row) + +SELECT '\377'::"char"::text; + text +------ + \377 +(1 row) + +SELECT '\000'::"char"::text; + text +------ + +(1 row) + +SELECT 'a'::text::"char"; + char +------ + a +(1 row) + +SELECT '\377'::text::"char"; + char +------ + \377 +(1 row) + +SELECT ''::text::"char"; + char +------ + +(1 row) + diff --git a/src/test/regress/sql/char.sql b/src/test/regress/sql/char.sql index 9c83c45e34..120fed53e5 100644 --- a/src/test/regress/sql/char.sql +++ b/src/test/regress/sql/char.sql @@ -2,8 +2,8 @@ -- CHAR -- --- fixed-length by value --- internally passed by value if <= 4 bytes in storage +-- Per SQL standard, CHAR means character(1), that is a varlena type +-- with a constraint restricting it to one character (not byte) SELECT char 'c' = char 'c' AS true; @@ -71,3 +71,19 @@ DROP TABLE CHAR_TBL; INSERT INTO CHAR_TBL (f1) VALUES ('abcde'); SELECT * FROM CHAR_TBL; + +-- +-- Also test "char", which is an ad-hoc one-byte type. It can only +-- really store ASCII characters, but we allow high-bit-set characters +-- to be accessed via bytea-like escapes. +-- + +SELECT 'a'::"char"; +SELECT '\101'::"char"; +SELECT '\377'::"char"; +SELECT 'a'::"char"::text; +SELECT '\377'::"char"::text; +SELECT '\000'::"char"::text; +SELECT 'a'::text::"char"; +SELECT '\377'::text::"char"; +SELECT ''::text::"char";
pgsql-hackers by date: