Re: Allow to_date() and to_timestamp() to accept localized names - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Allow to_date() and to_timestamp() to accept localized names |
Date | |
Msg-id | 20326.1579816853@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Allow to_date() and to_timestamp() to accept localized names (Alvaro Herrera <alvherre@2ndquadrant.com>) |
Responses |
Re: Allow to_date() and to_timestamp() to accept localized names
|
List | pgsql-hackers |
Here's a v7 patch, rebased over my recent hacking, and addressing most of the complaints I raised in <31691.1579648824@sss.pgh.pa.us>. However, I've got some new complaints now that I've looked harder at the code: * It's not exactly apparent to me why this code should be concerned about non-normalized characters when noplace else in the backend is. I think we should either rip that out entirely, or move the logic into str_tolower (and hence also str_toupper etc). I'd tend to favor the former, but of course I don't speak any languages where this would be an issue. Perhaps a better idea is to invent a new SQL function that normalizes a given string, and expect users to call that first if they'd like to_date() to match unnormalized text. * I have no faith in this calculation that decides how long the match length was: *len = element_len + name_len - norm_len; I seriously doubt that this does the right thing if either the normalization or the downcasing changed the byte length of the string. I'm not actually sure how we can do that correctly. There's no good way to know whether the changes happened within the part of the "name" string that we matched, or the part beyond what we matched, but we only want to count the former. So this seems like a pretty hard problem, and even if this logic is somehow correct as it stands, it needs a comment explaining why. * I'm still concerned about the risk of integer overflow in the string allocations in seq_search_localized(). Those need to be adjusted to be more paranoid, similar to the code in e.g. str_tolower. regards, tom lane diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 6c4359d..ff44496 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -5934,7 +5934,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); </row> <row> <entry><literal>TM</literal> prefix</entry> - <entry>translation mode (print localized day and month names based on + <entry>translation mode (use localized day and month names based on <xref linkend="guc-lc-time"/>)</entry> <entry><literal>TMMonth</literal></entry> </row> @@ -5966,8 +5966,13 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); <listitem> <para> <literal>TM</literal> does not include trailing blanks. + </para> + </listitem> + + <listitem> + <para> <function>to_timestamp</function> and <function>to_date</function> ignore - the <literal>TM</literal> modifier. + the case when receiving names as an input. </para> </listitem> diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 792f9ca..2f39050 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -87,6 +87,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/unicode_norm.h" #include "mb/pg_wchar.h" #include "parser/scansup.h" #include "utils/builtins.h" @@ -1059,9 +1060,11 @@ static int from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode *node, bool *have_error); static int from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_error); -static int seq_search(const char *name, const char *const *array, int *len); +static int seq_search_ascii(const char *name, const char *const *array, int *len); +static int seq_search_localized(const char *name, char **array, int *len); static int from_char_seq_search(int *dest, const char **src, const char *const *array, + char **localized_array, FormatNode *node, bool *have_error); static void do_to_timestamp(text *date_txt, text *fmt, bool std, struct pg_tm *tm, fsec_t *fsec, int *fprec, @@ -2459,7 +2462,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er * suitable for comparisons to ASCII strings. */ static int -seq_search(const char *name, const char *const *array, int *len) +seq_search_ascii(const char *name, const char *const *array, int *len) { unsigned char firstc; const char *const *a; @@ -2505,8 +2508,92 @@ seq_search(const char *name, const char *const *array, int *len) } /* - * Perform a sequential search in 'array' for an entry matching the first - * character(s) of the 'src' string case-insensitively. + * Sequentially search an array of possibly non-English words for + * a case-insensitive match to the initial character(s) of "name". + * + * This has the same API as seq_search_ascii(), but we use a more general + * downcasing transformation to achieve case-insensitivity. + * + * The array is treated as const, but we don't declare it that way because + * the arrays exported by pg_locale.c aren't const. + */ +static int +seq_search_localized(const char *name, char **array, int *len) +{ + char **a; + char *lower_name; + char *norm_name; + int name_len; + int norm_len; + + *len = 0; + + /* empty string can't match anything */ + if (!*name) + return -1; + + name_len = strlen(name); + + /* Normalize name, if working in Unicode */ + if (GetDatabaseEncoding() == PG_UTF8) + { + pg_wchar *wchar_name; + pg_wchar *norm_wname; + size_t name_wlen; + size_t norm_wlen; + + wchar_name = (pg_wchar *) palloc((name_len + 1) * sizeof(pg_wchar)); + name_wlen = pg_mb2wchar_with_len(name, wchar_name, name_len); + norm_wname = unicode_normalize_kc(wchar_name); + pfree(wchar_name); + norm_wlen = pg_wchar_strlen(norm_wname); + norm_name = (char *) palloc(norm_wlen * MAX_MULTIBYTE_CHAR_LEN + 1); + norm_len = pg_wchar2mb_with_len(norm_wname, norm_name, norm_wlen); + pfree(norm_wname); + } + else + { + norm_name = unconstify(char *, name); + norm_len = name_len; + } + + /* Now lower-case it */ + lower_name = str_tolower(norm_name, norm_len, DEFAULT_COLLATION_OID); + if ((const char *) norm_name != name) + pfree(norm_name); + + for (a = array; *a != NULL; a++) + { + char *lower_element; + int element_len; + + /* Lower-case array element, assuming it is normalized */ + lower_element = str_tolower(*a, strlen(*a), DEFAULT_COLLATION_OID); + element_len = strlen(lower_element); + + /* Match? */ + if (strncmp(lower_name, lower_element, element_len) == 0) + { + *len = element_len + name_len - norm_len; + pfree(lower_element); + pfree(lower_name); + return a - array; + } + pfree(lower_element); + } + + pfree(lower_name); + return -1; +} + +/* + * Perform a sequential search in 'array' (or 'localized_array', if that's + * not NULL) for an entry matching the first character(s) of the 'src' + * string case-insensitively. + * + * The 'array' is presumed to be English words (all-ASCII), but + * if 'localized_array' is supplied, that might be non-English + * so we need a more expensive downcasing transformation. * * If a match is found, copy the array index of the match into the integer * pointed to by 'dest', advance 'src' to the end of the part of the string @@ -2520,11 +2607,15 @@ seq_search(const char *name, const char *const *array, int *len) */ static int from_char_seq_search(int *dest, const char **src, const char *const *array, + char **localized_array, FormatNode *node, bool *have_error) { int len; - *dest = seq_search(*src, array, &len); + if (localized_array == NULL) + *dest = seq_search_ascii(*src, array, &len); + else + *dest = seq_search_localized(*src, localized_array, &len); if (len <= 0) { @@ -3172,6 +3263,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, /* number of extra skipped characters (more than given in format string) */ int extra_skip = 0; + /* cache localized days and months */ + cache_locale_time(); + for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++) { /* @@ -3272,7 +3366,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_P_M: case DCH_a_m: case DCH_p_m: - from_char_seq_search(&value, &s, ampm_strings_long, + from_char_seq_search(&value, &s, ampm_strings_long, NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->pm, value % 2, n, have_error); @@ -3283,7 +3377,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_PM: case DCH_am: case DCH_pm: - from_char_seq_search(&value, &s, ampm_strings, + from_char_seq_search(&value, &s, ampm_strings, NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->pm, value % 2, n, have_error); @@ -3396,7 +3490,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_B_C: case DCH_a_d: case DCH_b_c: - from_char_seq_search(&value, &s, adbc_strings_long, + from_char_seq_search(&value, &s, adbc_strings_long, NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->bc, value % 2, n, have_error); @@ -3406,7 +3500,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_BC: case DCH_ad: case DCH_bc: - from_char_seq_search(&value, &s, adbc_strings, + from_char_seq_search(&value, &s, adbc_strings, NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->bc, value % 2, n, have_error); @@ -3416,6 +3510,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Month: case DCH_month: from_char_seq_search(&value, &s, months_full, + S_TM(n->suffix) ? localized_full_months : NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, value + 1, n, have_error); @@ -3425,6 +3520,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Mon: case DCH_mon: from_char_seq_search(&value, &s, months, + S_TM(n->suffix) ? localized_abbrev_months : NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, value + 1, n, have_error); @@ -3439,6 +3535,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Day: case DCH_day: from_char_seq_search(&value, &s, days, + S_TM(n->suffix) ? localized_full_days : NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->d, value, n, have_error); @@ -3449,6 +3546,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Dy: case DCH_dy: from_char_seq_search(&value, &s, days_short, + S_TM(n->suffix) ? localized_abbrev_days : NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->d, value, n, have_error); @@ -3566,7 +3664,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, break; case DCH_RM: case DCH_rm: - from_char_seq_search(&value, &s, rm_months_lower, + from_char_seq_search(&value, &s, rm_months_lower, NULL, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 25fb7e2..64fd3ae 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -96,11 +96,17 @@ char *locale_monetary; char *locale_numeric; char *locale_time; -/* lc_time localization cache */ -char *localized_abbrev_days[7]; -char *localized_full_days[7]; -char *localized_abbrev_months[12]; -char *localized_full_months[12]; +/* + * lc_time localization cache. + * + * We use only the first 7 or 12 entries of these arrays. The last array + * element is left as NULL for the convenience of outside code that wants + * to sequentially scan these arrays. + */ +char *localized_abbrev_days[7 + 1]; +char *localized_full_days[7 + 1]; +char *localized_abbrev_months[12 + 1]; +char *localized_full_months[12 + 1]; /* indicates whether locale information cache is valid */ static bool CurrentLocaleConvValid = false; @@ -922,6 +928,8 @@ cache_locale_time(void) cache_single_string(&localized_full_days[i], bufptr, encoding); bufptr += MAX_L10N_DATA; } + localized_abbrev_days[7] = NULL; + localized_full_days[7] = NULL; /* localized months */ for (i = 0; i < 12; i++) @@ -931,6 +939,8 @@ cache_locale_time(void) cache_single_string(&localized_full_months[i], bufptr, encoding); bufptr += MAX_L10N_DATA; } + localized_abbrev_months[12] = NULL; + localized_full_months[12] = NULL; CurrentLCTimeValid = true; } diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out index 37c6add..af3762e 100644 --- a/src/test/regress/expected/collate.linux.utf8.out +++ b/src/test/regress/expected/collate.linux.utf8.out @@ -461,6 +461,22 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); 01 NİS 2010 (1 row) +-- to_date +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E + to_date +------------ + 02-01-2010 +(1 row) + +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327 + to_date +------------ + 02-01-2010 +(1 row) + +SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail +ERROR: invalid value "1234567890ab" for "MONTH" +DETAIL: The given value did not match any of the allowed values for this field. -- backwards parsing CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc'; CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C"; diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql index 8c26f16..b0a85c2 100644 --- a/src/test/regress/sql/collate.linux.utf8.sql +++ b/src/test/regress/sql/collate.linux.utf8.sql @@ -182,6 +182,12 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR"); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY'); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); +-- to_date + +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327 +SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail + -- backwards parsing
pgsql-hackers by date: