Re: Allow to_date() and to_timestamp() to accept localized names - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Allow to_date() and to_timestamp() to accept localized names
Date
Msg-id 20326.1579816853@sss.pgh.pa.us
Whole thread Raw
In response to Re: Allow to_date() and to_timestamp() to accept localized names  (Alvaro Herrera <alvherre@2ndquadrant.com>)
Responses Re: Allow to_date() and to_timestamp() to accept localized names  (Juan José Santamaría Flecha <juanjo.santamaria@gmail.com>)
List pgsql-hackers
Here's a v7 patch, rebased over my recent hacking, and addressing
most of the complaints I raised in <31691.1579648824@sss.pgh.pa.us>.
However, I've got some new complaints now that I've looked harder
at the code:

* It's not exactly apparent to me why this code should be concerned
about non-normalized characters when noplace else in the backend is.
I think we should either rip that out entirely, or move the logic
into str_tolower (and hence also str_toupper etc).  I'd tend to favor
the former, but of course I don't speak any languages where this would
be an issue.  Perhaps a better idea is to invent a new SQL function
that normalizes a given string, and expect users to call that first
if they'd like to_date() to match unnormalized text.

* I have no faith in this calculation that decides how long the match
length was:

        *len = element_len + name_len - norm_len;

I seriously doubt that this does the right thing if either the
normalization or the downcasing changed the byte length of the
string.  I'm not actually sure how we can do that correctly.
There's no good way to know whether the changes happened within
the part of the "name" string that we matched, or the part beyond
what we matched, but we only want to count the former.  So this
seems like a pretty hard problem, and even if this logic is somehow
correct as it stands, it needs a comment explaining why.

* I'm still concerned about the risk of integer overflow in the
string allocations in seq_search_localized().  Those need to be
adjusted to be more paranoid, similar to the code in e.g. str_tolower.

            regards, tom lane

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6c4359d..ff44496 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -5934,7 +5934,7 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        </row>
        <row>
         <entry><literal>TM</literal> prefix</entry>
-        <entry>translation mode (print localized day and month names based on
+        <entry>translation mode (use localized day and month names based on
          <xref linkend="guc-lc-time"/>)</entry>
         <entry><literal>TMMonth</literal></entry>
        </row>
@@ -5966,8 +5966,13 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      <listitem>
       <para>
        <literal>TM</literal> does not include trailing blanks.
+      </para>
+     </listitem>
+
+     <listitem>
+      <para>
        <function>to_timestamp</function> and <function>to_date</function> ignore
-       the <literal>TM</literal> modifier.
+       the case when receiving names as an input.
       </para>
      </listitem>

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 792f9ca..2f39050 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -87,6 +87,7 @@

 #include "catalog/pg_collation.h"
 #include "catalog/pg_type.h"
+#include "common/unicode_norm.h"
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
 #include "utils/builtins.h"
@@ -1059,9 +1060,11 @@ static int    from_char_parse_int_len(int *dest, const char **src, const int len,
                                     FormatNode *node, bool *have_error);
 static int    from_char_parse_int(int *dest, const char **src, FormatNode *node,
                                 bool *have_error);
-static int    seq_search(const char *name, const char *const *array, int *len);
+static int    seq_search_ascii(const char *name, const char *const *array, int *len);
+static int    seq_search_localized(const char *name, char **array, int *len);
 static int    from_char_seq_search(int *dest, const char **src,
                                  const char *const *array,
+                                 char **localized_array,
                                  FormatNode *node, bool *have_error);
 static void do_to_timestamp(text *date_txt, text *fmt, bool std,
                             struct pg_tm *tm, fsec_t *fsec, int *fprec,
@@ -2459,7 +2462,7 @@ from_char_parse_int(int *dest, const char **src, FormatNode *node, bool *have_er
  * suitable for comparisons to ASCII strings.
  */
 static int
-seq_search(const char *name, const char *const *array, int *len)
+seq_search_ascii(const char *name, const char *const *array, int *len)
 {
     unsigned char firstc;
     const char *const *a;
@@ -2505,8 +2508,92 @@ seq_search(const char *name, const char *const *array, int *len)
 }

 /*
- * Perform a sequential search in 'array' for an entry matching the first
- * character(s) of the 'src' string case-insensitively.
+ * Sequentially search an array of possibly non-English words for
+ * a case-insensitive match to the initial character(s) of "name".
+ *
+ * This has the same API as seq_search_ascii(), but we use a more general
+ * downcasing transformation to achieve case-insensitivity.
+ *
+ * The array is treated as const, but we don't declare it that way because
+ * the arrays exported by pg_locale.c aren't const.
+ */
+static int
+seq_search_localized(const char *name, char **array, int *len)
+{
+    char      **a;
+    char       *lower_name;
+    char       *norm_name;
+    int            name_len;
+    int            norm_len;
+
+    *len = 0;
+
+    /* empty string can't match anything */
+    if (!*name)
+        return -1;
+
+    name_len = strlen(name);
+
+    /* Normalize name, if working in Unicode */
+    if (GetDatabaseEncoding() == PG_UTF8)
+    {
+        pg_wchar   *wchar_name;
+        pg_wchar   *norm_wname;
+        size_t        name_wlen;
+        size_t        norm_wlen;
+
+        wchar_name = (pg_wchar *) palloc((name_len + 1) * sizeof(pg_wchar));
+        name_wlen = pg_mb2wchar_with_len(name, wchar_name, name_len);
+        norm_wname = unicode_normalize_kc(wchar_name);
+        pfree(wchar_name);
+        norm_wlen = pg_wchar_strlen(norm_wname);
+        norm_name = (char *) palloc(norm_wlen * MAX_MULTIBYTE_CHAR_LEN + 1);
+        norm_len = pg_wchar2mb_with_len(norm_wname, norm_name, norm_wlen);
+        pfree(norm_wname);
+    }
+    else
+    {
+        norm_name = unconstify(char *, name);
+        norm_len = name_len;
+    }
+
+    /* Now lower-case it */
+    lower_name = str_tolower(norm_name, norm_len, DEFAULT_COLLATION_OID);
+    if ((const char *) norm_name != name)
+        pfree(norm_name);
+
+    for (a = array; *a != NULL; a++)
+    {
+        char       *lower_element;
+        int            element_len;
+
+        /* Lower-case array element, assuming it is normalized */
+        lower_element = str_tolower(*a, strlen(*a), DEFAULT_COLLATION_OID);
+        element_len = strlen(lower_element);
+
+        /* Match? */
+        if (strncmp(lower_name, lower_element, element_len) == 0)
+        {
+            *len = element_len + name_len - norm_len;
+            pfree(lower_element);
+            pfree(lower_name);
+            return a - array;
+        }
+        pfree(lower_element);
+    }
+
+    pfree(lower_name);
+    return -1;
+}
+
+/*
+ * Perform a sequential search in 'array' (or 'localized_array', if that's
+ * not NULL) for an entry matching the first character(s) of the 'src'
+ * string case-insensitively.
+ *
+ * The 'array' is presumed to be English words (all-ASCII), but
+ * if 'localized_array' is supplied, that might be non-English
+ * so we need a more expensive downcasing transformation.
  *
  * If a match is found, copy the array index of the match into the integer
  * pointed to by 'dest', advance 'src' to the end of the part of the string
@@ -2520,11 +2607,15 @@ seq_search(const char *name, const char *const *array, int *len)
  */
 static int
 from_char_seq_search(int *dest, const char **src, const char *const *array,
+                     char **localized_array,
                      FormatNode *node, bool *have_error)
 {
     int            len;

-    *dest = seq_search(*src, array, &len);
+    if (localized_array == NULL)
+        *dest = seq_search_ascii(*src, array, &len);
+    else
+        *dest = seq_search_localized(*src, localized_array, &len);

     if (len <= 0)
     {
@@ -3172,6 +3263,9 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
     /* number of extra skipped characters (more than given in format string) */
     int            extra_skip = 0;

+    /* cache localized days and months */
+    cache_locale_time();
+
     for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
     {
         /*
@@ -3272,7 +3366,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_P_M:
             case DCH_a_m:
             case DCH_p_m:
-                from_char_seq_search(&value, &s, ampm_strings_long,
+                from_char_seq_search(&value, &s, ampm_strings_long, NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3283,7 +3377,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_PM:
             case DCH_am:
             case DCH_pm:
-                from_char_seq_search(&value, &s, ampm_strings,
+                from_char_seq_search(&value, &s, ampm_strings, NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->pm, value % 2, n, have_error);
@@ -3396,7 +3490,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_B_C:
             case DCH_a_d:
             case DCH_b_c:
-                from_char_seq_search(&value, &s, adbc_strings_long,
+                from_char_seq_search(&value, &s, adbc_strings_long, NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3406,7 +3500,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_BC:
             case DCH_ad:
             case DCH_bc:
-                from_char_seq_search(&value, &s, adbc_strings,
+                from_char_seq_search(&value, &s, adbc_strings, NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->bc, value % 2, n, have_error);
@@ -3416,6 +3510,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_Month:
             case DCH_month:
                 from_char_seq_search(&value, &s, months_full,
+                                     S_TM(n->suffix) ? localized_full_months : NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3425,6 +3520,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_Mon:
             case DCH_mon:
                 from_char_seq_search(&value, &s, months,
+                                     S_TM(n->suffix) ? localized_abbrev_months : NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->mm, value + 1, n, have_error);
@@ -3439,6 +3535,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_Day:
             case DCH_day:
                 from_char_seq_search(&value, &s, days,
+                                     S_TM(n->suffix) ? localized_full_days : NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->d, value, n, have_error);
@@ -3449,6 +3546,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
             case DCH_Dy:
             case DCH_dy:
                 from_char_seq_search(&value, &s, days_short,
+                                     S_TM(n->suffix) ? localized_abbrev_days : NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->d, value, n, have_error);
@@ -3566,7 +3664,7 @@ DCH_from_char(FormatNode *node, const char *in, TmFromChar *out, bool std,
                 break;
             case DCH_RM:
             case DCH_rm:
-                from_char_seq_search(&value, &s, rm_months_lower,
+                from_char_seq_search(&value, &s, rm_months_lower, NULL,
                                      n, have_error);
                 CHECK_ERROR;
                 from_char_set_int(&out->mm, MONTHS_PER_YEAR - value,
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 25fb7e2..64fd3ae 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -96,11 +96,17 @@ char       *locale_monetary;
 char       *locale_numeric;
 char       *locale_time;

-/* lc_time localization cache */
-char       *localized_abbrev_days[7];
-char       *localized_full_days[7];
-char       *localized_abbrev_months[12];
-char       *localized_full_months[12];
+/*
+ * lc_time localization cache.
+ *
+ * We use only the first 7 or 12 entries of these arrays.  The last array
+ * element is left as NULL for the convenience of outside code that wants
+ * to sequentially scan these arrays.
+ */
+char       *localized_abbrev_days[7 + 1];
+char       *localized_full_days[7 + 1];
+char       *localized_abbrev_months[12 + 1];
+char       *localized_full_months[12 + 1];

 /* indicates whether locale information cache is valid */
 static bool CurrentLocaleConvValid = false;
@@ -922,6 +928,8 @@ cache_locale_time(void)
         cache_single_string(&localized_full_days[i], bufptr, encoding);
         bufptr += MAX_L10N_DATA;
     }
+    localized_abbrev_days[7] = NULL;
+    localized_full_days[7] = NULL;

     /* localized months */
     for (i = 0; i < 12; i++)
@@ -931,6 +939,8 @@ cache_locale_time(void)
         cache_single_string(&localized_full_months[i], bufptr, encoding);
         bufptr += MAX_L10N_DATA;
     }
+    localized_abbrev_months[12] = NULL;
+    localized_full_months[12] = NULL;

     CurrentLCTimeValid = true;
 }
diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out
index 37c6add..af3762e 100644
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -461,6 +461,22 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
  01 NİS 2010
 (1 row)

+-- to_date
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E
+  to_date
+------------
+ 02-01-2010
+(1 row)
+
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327
+  to_date
+------------
+ 02-01-2010
+(1 row)
+
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+ERROR:  invalid value "1234567890ab" for "MONTH"
+DETAIL:  The given value did not match any of the allowed values for this field.
 -- backwards parsing
 CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql
index 8c26f16..b0a85c2 100644
--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -182,6 +182,12 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR");
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");

+-- to_date
+
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E
+SELECT to_date('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327
+SELECT to_date('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+

 -- backwards parsing


pgsql-hackers by date:

Previous
From: Alvaro Herrera
Date:
Subject: Re: progress report for ANALYZE
Next
From: Tom Lane
Date:
Subject: Busted(?) optimization in ATAddForeignKeyConstraint