Re: Allow to_date() and to_timestamp() to accept localized names - Mailing list pgsql-hackers
From | Tom Lane |
---|---|
Subject | Re: Allow to_date() and to_timestamp() to accept localized names |
Date | |
Msg-id | 6719.1583101530@sss.pgh.pa.us Whole thread Raw |
In response to | Re: Allow to_date() and to_timestamp() to accept localized names (Tom Lane <tgl@sss.pgh.pa.us>) |
Responses |
Re: Allow to_date() and to_timestamp() to accept localized names
|
List | pgsql-hackers |
I wrote: > * I don't think it's appropriate to hard-wire DEFAULT_COLLATION_OID > as the collation to do case-folding with. For to_date/to_timestamp, > we have collatable text input so we can rely on the function's input > collation instead, at the cost of having to pass down the collation > OID through a few layers of subroutines :-(. For parse_datetime, > I punted for now and let it use DEFAULT_COLLATION_OID, because that's > currently only called by JSONB code that probably hasn't got a usable > input collation anyway (since jsonb isn't considered collatable). On closer look, it's probably a wise idea to change the signature of parse_datetime() to include a collation argument, because that function is new in v13 so there's no API-break argument against it. It will never be cheaper to change it than today. So v11 below does that, pushing the use of DEFAULT_COLLATION_OID into the json-specific code. Perhaps somebody else would like to look at whether there's something brighter for that code to do, but I still suspect there isn't, so I didn't chase it further. > Barring objections, this seems > committable to me. Going once, going twice ... regards, tom lane diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 28035f1..8b73e05 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -5968,7 +5968,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); </row> <row> <entry><literal>TM</literal> prefix</entry> - <entry>translation mode (print localized day and month names based on + <entry>translation mode (use localized day and month names based on <xref linkend="guc-lc-time"/>)</entry> <entry><literal>TMMonth</literal></entry> </row> @@ -5999,9 +5999,20 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); <listitem> <para> - <literal>TM</literal> does not include trailing blanks. - <function>to_timestamp</function> and <function>to_date</function> ignore - the <literal>TM</literal> modifier. + <literal>TM</literal> suppresses trailing blanks whether or + not <literal>FM</literal> is specified. + </para> + </listitem> + + <listitem> + <para> + <function>to_timestamp</function> and <function>to_date</function> + ignore letter case in the input; so for + example <literal>MON</literal>, <literal>Mon</literal>, + and <literal>mon</literal> all accept the same strings. When using + the <literal>TM</literal> modifier, case-folding is done according to + the rules of the function's input collation (see + <xref linkend="collation"/>). </para> </listitem> diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index d029468..95f7d05 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1038,7 +1038,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw, static void DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid collid); static void DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, - bool std, bool *have_error); + Oid collid, bool std, bool *have_error); #ifdef DEBUG_TO_FROM_CHAR static void dump_index(const KeyWord *k, const int *index); @@ -1057,11 +1057,14 @@ static int from_char_parse_int_len(int *dest, const char **src, const int len, FormatNode *node, bool *have_error); static int from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_error); -static int seq_search(const char *name, const char *const *array, int *len); +static int seq_search_ascii(const char *name, const char *const *array, int *len); +static int seq_search_localized(const char *name, char **array, int *len, + Oid collid); static int from_char_seq_search(int *dest, const char **src, const char *const *array, + char **localized_array, Oid collid, FormatNode *node, bool *have_error); -static void do_to_timestamp(text *date_txt, text *fmt, bool std, +static void do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std, struct pg_tm *tm, fsec_t *fsec, int *fprec, uint32 *flags, bool *have_error); static char *fill_str(char *str, int c, int max); @@ -2457,7 +2460,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er * suitable for comparisons to ASCII strings. */ static int -seq_search(const char *name, const char *const *array, int *len) +seq_search_ascii(const char *name, const char *const *array, int *len) { unsigned char firstc; const char *const *a; @@ -2503,8 +2506,89 @@ seq_search(const char *name, const char *const *array, int *len) } /* - * Perform a sequential search in 'array' for an entry matching the first - * character(s) of the 'src' string case-insensitively. + * Sequentially search an array of possibly non-English words for + * a case-insensitive match to the initial character(s) of "name". + * + * This has the same API as seq_search_ascii(), but we use a more general + * case-folding transformation to achieve case-insensitivity. Case folding + * is done per the rules of the collation identified by "collid". + * + * The array is treated as const, but we don't declare it that way because + * the arrays exported by pg_locale.c aren't const. + */ +static int +seq_search_localized(const char *name, char **array, int *len, Oid collid) +{ + char **a; + char *upper_name; + char *lower_name; + + *len = 0; + + /* empty string can't match anything */ + if (!*name) + return -1; + + /* + * The case-folding processing done below is fairly expensive, so before + * doing that, make a quick pass to see if there is an exact match. + */ + for (a = array; *a != NULL; a++) + { + int element_len = strlen(*a); + + if (strncmp(name, *a, element_len) == 0) + { + *len = element_len; + return a - array; + } + } + + /* + * Fold to upper case, then to lower case, so that we can match reliably + * even in languages in which case conversions are not injective. + */ + upper_name = str_toupper(unconstify(char *, name), strlen(name), collid); + lower_name = str_tolower(upper_name, strlen(upper_name), collid); + pfree(upper_name); + + for (a = array; *a != NULL; a++) + { + char *upper_element; + char *lower_element; + int element_len; + + /* Likewise upper/lower-case array element */ + upper_element = str_toupper(*a, strlen(*a), collid); + lower_element = str_tolower(upper_element, strlen(upper_element), + collid); + pfree(upper_element); + element_len = strlen(lower_element); + + /* Match? */ + if (strncmp(lower_name, lower_element, element_len) == 0) + { + *len = element_len; + pfree(lower_element); + pfree(lower_name); + return a - array; + } + pfree(lower_element); + } + + pfree(lower_name); + return -1; +} + +/* + * Perform a sequential search in 'array' (or 'localized_array', if that's + * not NULL) for an entry matching the first character(s) of the 'src' + * string case-insensitively. + * + * The 'array' is presumed to be English words (all-ASCII), but + * if 'localized_array' is supplied, that might be non-English + * so we need a more expensive case-folding transformation + * (which will follow the rules of the collation 'collid'). * * If a match is found, copy the array index of the match into the integer * pointed to by 'dest', advance 'src' to the end of the part of the string @@ -2518,11 +2602,15 @@ seq_search(const char *name, const char *const *array, int *len) */ static int from_char_seq_search(int *dest, const char **src, const char *const *array, + char **localized_array, Oid collid, FormatNode *node, bool *have_error) { int len; - *dest = seq_search(*src, array, &len); + if (localized_array == NULL) + *dest = seq_search_ascii(*src, array, &len); + else + *dest = seq_search_localized(*src, localized_array, &len, collid); if (len <= 0) { @@ -3147,19 +3235,20 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col *s = '\0'; } -/* ---------- - * Process a string as denoted by a list of FormatNodes. +/* + * Process the string 'in' as denoted by the array of FormatNodes 'node[]'. * The TmFromChar struct pointed to by 'out' is populated with the results. * + * 'collid' identifies the collation to use, if needed. + * 'std' specifies standard parsing mode. + * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set. + * * Note: we currently don't have any to_interval() function, so there * is no need here for INVALID_FOR_INTERVAL checks. - * - * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set. - * ---------- */ static void -DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, - bool *have_error) +DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, + Oid collid, bool std, bool *have_error) { FormatNode *n; const char *s; @@ -3170,6 +3259,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, /* number of extra skipped characters (more than given in format string) */ int extra_skip = 0; + /* cache localized days and months */ + cache_locale_time(); + for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++) { /* @@ -3271,6 +3363,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_a_m: case DCH_p_m: from_char_seq_search(&value, &s, ampm_strings_long, + NULL, InvalidOid, n, have_error); CHECK_ERROR; from_char_set_int(&out->pm, value % 2, n, have_error); @@ -3282,6 +3375,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_am: case DCH_pm: from_char_seq_search(&value, &s, ampm_strings, + NULL, InvalidOid, n, have_error); CHECK_ERROR; from_char_set_int(&out->pm, value % 2, n, have_error); @@ -3395,6 +3489,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_a_d: case DCH_b_c: from_char_seq_search(&value, &s, adbc_strings_long, + NULL, InvalidOid, n, have_error); CHECK_ERROR; from_char_set_int(&out->bc, value % 2, n, have_error); @@ -3405,6 +3500,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_ad: case DCH_bc: from_char_seq_search(&value, &s, adbc_strings, + NULL, InvalidOid, n, have_error); CHECK_ERROR; from_char_set_int(&out->bc, value % 2, n, have_error); @@ -3414,6 +3510,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Month: case DCH_month: from_char_seq_search(&value, &s, months_full, + S_TM(n->suffix) ? localized_full_months : NULL, + collid, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, value + 1, n, have_error); @@ -3423,6 +3521,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Mon: case DCH_mon: from_char_seq_search(&value, &s, months, + S_TM(n->suffix) ? localized_abbrev_months : NULL, + collid, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, value + 1, n, have_error); @@ -3437,6 +3537,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Day: case DCH_day: from_char_seq_search(&value, &s, days, + S_TM(n->suffix) ? localized_full_days : NULL, + collid, n, have_error); CHECK_ERROR; from_char_set_int(&out->d, value, n, have_error); @@ -3447,6 +3549,8 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_Dy: case DCH_dy: from_char_seq_search(&value, &s, days_short, + S_TM(n->suffix) ? localized_abbrev_days : NULL, + collid, n, have_error); CHECK_ERROR; from_char_set_int(&out->d, value, n, have_error); @@ -3565,6 +3669,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std, case DCH_RM: case DCH_rm: from_char_seq_search(&value, &s, rm_months_lower, + NULL, InvalidOid, n, have_error); CHECK_ERROR; from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, @@ -4031,13 +4136,15 @@ to_timestamp(PG_FUNCTION_ARGS) { text *date_txt = PG_GETARG_TEXT_PP(0); text *fmt = PG_GETARG_TEXT_PP(1); + Oid collid = PG_GET_COLLATION(); Timestamp result; int tz; struct pg_tm tm; fsec_t fsec; int fprec; - do_to_timestamp(date_txt, fmt, false, &tm, &fsec, &fprec, NULL, NULL); + do_to_timestamp(date_txt, fmt, collid, false, + &tm, &fsec, &fprec, NULL, NULL); /* Use the specified time zone, if any. */ if (tm.tm_zone) @@ -4072,11 +4179,13 @@ to_date(PG_FUNCTION_ARGS) { text *date_txt = PG_GETARG_TEXT_PP(0); text *fmt = PG_GETARG_TEXT_PP(1); + Oid collid = PG_GET_COLLATION(); DateADT result; struct pg_tm tm; fsec_t fsec; - do_to_timestamp(date_txt, fmt, false, &tm, &fsec, NULL, NULL, NULL); + do_to_timestamp(date_txt, fmt, collid, false, + &tm, &fsec, NULL, NULL, NULL); /* Prevent overflow in Julian-day routines */ if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday)) @@ -4098,25 +4207,31 @@ to_date(PG_FUNCTION_ARGS) } /* - * Convert the 'date_txt' input to a datetime type using argument 'fmt' as a format string. + * Convert the 'date_txt' input to a datetime type using argument 'fmt' + * as a format string. The collation 'collid' may be used for case-folding + * rules in some cases. 'strict' specifies standard parsing mode. + * * The actual data type (returned in 'typid', 'typmod') is determined by * the presence of date/time/zone components in the format string. * - * When timezone component is present, the corresponding offset is set to '*tz'. + * When timezone component is present, the corresponding offset is + * returned in '*tz'. * * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set * and zero value is returned. */ Datum -parse_datetime(text *date_txt, text *fmt, bool strict, Oid *typid, - int32 *typmod, int *tz, bool *have_error) +parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict, + Oid *typid, int32 *typmod, int *tz, + bool *have_error) { struct pg_tm tm; fsec_t fsec; int fprec; uint32 flags; - do_to_timestamp(date_txt, fmt, strict, &tm, &fsec, &fprec, &flags, have_error); + do_to_timestamp(date_txt, fmt, collid, strict, + &tm, &fsec, &fprec, &flags, have_error); CHECK_ERROR; *typmod = fprec ? fprec : -1; /* fractional part precision */ @@ -4278,21 +4393,21 @@ on_error: * Parse the 'date_txt' according to 'fmt', return results as a struct pg_tm, * fractional seconds, and fractional precision. * + * 'collid' identifies the collation to use, if needed. + * 'std' specifies standard parsing mode. + * Bit mask of date/time/zone components found in 'fmt' is returned in 'flags', + * if that is not NULL. + * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set. + * * We parse 'fmt' into a list of FormatNodes, which is then passed to * DCH_from_char to populate a TmFromChar with the parsed contents of * 'date_txt'. * * The TmFromChar is then analysed and converted into the final results in - * struct 'tm' and 'fsec'. - * - * Bit mask of date/time/zone components found in 'fmt' is returned in 'flags'. - * - * 'std' specifies standard parsing mode. - * - * If 'have_error' is NULL, then errors are thrown, else '*have_error' is set. + * struct 'tm', 'fsec', and 'fprec'. */ static void -do_to_timestamp(text *date_txt, text *fmt, bool std, +do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std, struct pg_tm *tm, fsec_t *fsec, int *fprec, uint32 *flags, bool *have_error) { @@ -4352,7 +4467,7 @@ do_to_timestamp(text *date_txt, text *fmt, bool std, /* dump_index(DCH_keywords, DCH_index); */ #endif - DCH_from_char(format, date_str, &tmfc, std, have_error); + DCH_from_char(format, date_str, &tmfc, collid, std, have_error); CHECK_ERROR; pfree(fmt_str); diff --git a/src/backend/utils/adt/jsonpath_exec.c b/src/backend/utils/adt/jsonpath_exec.c index b6fdd47..1e4dd89 100644 --- a/src/backend/utils/adt/jsonpath_exec.c +++ b/src/backend/utils/adt/jsonpath_exec.c @@ -1781,6 +1781,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, JsonbValue jbvbuf; Datum value; text *datetime; + Oid collid; Oid typid; int32 typmod = -1; int tz = 0; @@ -1797,6 +1798,13 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, datetime = cstring_to_text_with_len(jb->val.string.val, jb->val.string.len); + /* + * At some point we might wish to have callers supply the collation to + * use, but right now it's unclear that they'd be able to do better than + * DEFAULT_COLLATION_OID anyway. + */ + collid = DEFAULT_COLLATION_OID; + if (jsp->content.arg) { text *template; @@ -1814,7 +1822,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, template = cstring_to_text_with_len(template_str, template_len); - value = parse_datetime(datetime, template, true, + value = parse_datetime(datetime, template, collid, true, &typid, &typmod, &tz, jspThrowErrors(cxt) ? NULL : &have_error); @@ -1858,7 +1866,7 @@ executeDateTimeMethod(JsonPathExecContext *cxt, JsonPathItem *jsp, MemoryContextSwitchTo(oldcxt); } - value = parse_datetime(datetime, fmt_txt[i], true, + value = parse_datetime(datetime, fmt_txt[i], collid, true, &typid, &typmod, &tz, &have_error); diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 25fb7e2..64fd3ae 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -96,11 +96,17 @@ char *locale_monetary; char *locale_numeric; char *locale_time; -/* lc_time localization cache */ -char *localized_abbrev_days[7]; -char *localized_full_days[7]; -char *localized_abbrev_months[12]; -char *localized_full_months[12]; +/* + * lc_time localization cache. + * + * We use only the first 7 or 12 entries of these arrays. The last array + * element is left as NULL for the convenience of outside code that wants + * to sequentially scan these arrays. + */ +char *localized_abbrev_days[7 + 1]; +char *localized_full_days[7 + 1]; +char *localized_abbrev_months[12 + 1]; +char *localized_full_months[12 + 1]; /* indicates whether locale information cache is valid */ static bool CurrentLocaleConvValid = false; @@ -922,6 +928,8 @@ cache_locale_time(void) cache_single_string(&localized_full_days[i], bufptr, encoding); bufptr += MAX_L10N_DATA; } + localized_abbrev_days[7] = NULL; + localized_full_days[7] = NULL; /* localized months */ for (i = 0; i < 12; i++) @@ -931,6 +939,8 @@ cache_locale_time(void) cache_single_string(&localized_full_months[i], bufptr, encoding); bufptr += MAX_L10N_DATA; } + localized_abbrev_months[12] = NULL; + localized_full_months[12] = NULL; CurrentLCTimeValid = true; } diff --git a/src/include/utils/formatting.h b/src/include/utils/formatting.h index 357deb9..8f62aa3 100644 --- a/src/include/utils/formatting.h +++ b/src/include/utils/formatting.h @@ -26,7 +26,7 @@ extern char *asc_tolower(const char *buff, size_t nbytes); extern char *asc_toupper(const char *buff, size_t nbytes); extern char *asc_initcap(const char *buff, size_t nbytes); -extern Datum parse_datetime(text *date_txt, text *fmt, bool std, +extern Datum parse_datetime(text *date_txt, text *fmt, Oid collid, bool strict, Oid *typid, int32 *typmod, int *tz, bool *have_error); diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out index 37c6add..f06ae54 100644 --- a/src/test/regress/expected/collate.linux.utf8.out +++ b/src/test/regress/expected/collate.linux.utf8.out @@ -461,6 +461,22 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); 01 NİS 2010 (1 row) +-- to_date +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); + to_date +------------ + 02-01-2010 +(1 row) + +SELECT to_date('01 Şub 2010', 'DD TMMON YYYY'); + to_date +------------ + 02-01-2010 +(1 row) + +SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail +ERROR: invalid value "1234567890ab" for "MONTH" +DETAIL: The given value did not match any of the allowed values for this field. -- backwards parsing CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc'; CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C"; diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql index 8c26f16..cbbd220 100644 --- a/src/test/regress/sql/collate.linux.utf8.sql +++ b/src/test/regress/sql/collate.linux.utf8.sql @@ -182,6 +182,12 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR"); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY'); SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR"); +-- to_date + +SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); +SELECT to_date('01 Şub 2010', 'DD TMMON YYYY'); +SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail + -- backwards parsing
pgsql-hackers by date: