Re: Fix number skipping in to_number - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Fix number skipping in to_number
Date
Msg-id 28186.1510957703@sss.pgh.pa.us
Whole thread Raw
In response to Re: Fix number skipping in to_number  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
I wrote:
> That leads me to the attached patch.  There is more that could be done
> here --- in particular, I'd like to see the character-not-byte-count
> rule extended to literal text.  But that seems like fit material for
> a different patch.

Attached is a patch that makes formatting.c more multibyte-aware;
it now handles multibyte characters as single NODE_TYPE_CHAR format
nodes, rather than one node per byte.  This doesn't really have much
impact on the output (to_char) side, but on the input side, it
greatly simplifies treating such characters as single characters
rather than multiple ones.  An example is that (in UTF8 encoding)
previously we got

u8=# select to_number('$12.34', '€99D99');
 to_number 
-----------
      0.34
(1 row)

because the literal euro sign is 3 bytes long and was thought to be
3 literal characters.  Now we get the expected result

u8=# select to_number('$12.34', '€99D99');
 to_number 
-----------
     12.34
(1 row)

Aside from skipping 1 input character (not byte) per literal format
character, I fixed the SKIP_THth macro, allowing to_date/to_timestamp to
also follow the rule of skipping whole characters not bytes for non-data
format patterns.  There might be some other places that need similar
adjustments, but I couldn't find any.

Not sure about whether/how to add regression tests for this; it's really
impossible to add specific tests in an ASCII-only test file.  Maybe we
could put a test or two into collate.linux.utf8.sql, but it seems a bit
off topic for that, and I think that test file hardly gets run anyway.

Note this needs to be applied over the patch I posted at
https://postgr.es/m/3626.1510949486@sss.pgh.pa.us
I intend to commit that fairly soon, but it's not in right now.

            regards, tom lane

diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index cb0dbf7..ec97de0 100644
*** a/src/backend/utils/adt/formatting.c
--- b/src/backend/utils/adt/formatting.c
*************** typedef enum
*** 151,158 ****
      FROM_CHAR_DATE_ISOWEEK        /* ISO 8601 week date */
  } FromCharDateMode;

- typedef struct FormatNode FormatNode;
-
  typedef struct
  {
      const char *name;
--- 151,156 ----
*************** typedef struct
*** 162,174 ****
      FromCharDateMode date_mode;
  } KeyWord;

! struct FormatNode
  {
!     int            type;            /* node type            */
!     const KeyWord *key;            /* if node type is KEYWORD    */
!     char        character;        /* if node type is CHAR        */
!     int            suffix;            /* keyword suffix        */
! };

  #define NODE_TYPE_END        1
  #define NODE_TYPE_ACTION    2
--- 160,172 ----
      FromCharDateMode date_mode;
  } KeyWord;

! typedef struct
  {
!     int            type;            /* NODE_TYPE_XXX, see below */
!     const KeyWord *key;            /* if type is ACTION */
!     char        character[MAX_MULTIBYTE_CHAR_LEN + 1];    /* if type is CHAR */
!     int            suffix;            /* keyword prefix/suffix code, if any */
! } FormatNode;

  #define NODE_TYPE_END        1
  #define NODE_TYPE_ACTION    2
*************** parse_format(FormatNode *node, const cha
*** 1282,1293 ****
          }
          else if (*str)
          {
              /*
               * Process double-quoted literal string, if any
               */
              if (*str == '"')
              {
!                 while (*(++str))
                  {
                      if (*str == '"')
                      {
--- 1280,1294 ----
          }
          else if (*str)
          {
+             int            chlen;
+
              /*
               * Process double-quoted literal string, if any
               */
              if (*str == '"')
              {
!                 str++;
!                 while (*str)
                  {
                      if (*str == '"')
                      {
*************** parse_format(FormatNode *node, const cha
*** 1297,1307 ****
                      /* backslash quotes the next character, if any */
                      if (*str == '\\' && *(str + 1))
                          str++;
                      n->type = NODE_TYPE_CHAR;
!                     n->character = *str;
                      n->key = NULL;
                      n->suffix = 0;
                      n++;
                  }
              }
              else
--- 1298,1311 ----
                      /* backslash quotes the next character, if any */
                      if (*str == '\\' && *(str + 1))
                          str++;
+                     chlen = pg_mblen(str);
                      n->type = NODE_TYPE_CHAR;
!                     memcpy(n->character, str, chlen);
!                     n->character[chlen] = '\0';
                      n->key = NULL;
                      n->suffix = 0;
                      n++;
+                     str += chlen;
                  }
              }
              else
*************** parse_format(FormatNode *node, const cha
*** 1312,1323 ****
                   */
                  if (*str == '\\' && *(str + 1) == '"')
                      str++;
                  n->type = NODE_TYPE_CHAR;
!                 n->character = *str;
                  n->key = NULL;
                  n->suffix = 0;
                  n++;
!                 str++;
              }
          }
      }
--- 1316,1329 ----
                   */
                  if (*str == '\\' && *(str + 1) == '"')
                      str++;
+                 chlen = pg_mblen(str);
                  n->type = NODE_TYPE_CHAR;
!                 memcpy(n->character, str, chlen);
!                 n->character[chlen] = '\0';
                  n->key = NULL;
                  n->suffix = 0;
                  n++;
!                 str += chlen;
              }
          }
      }
*************** dump_node(FormatNode *node, int max)
*** 1349,1355 ****
              elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
                   a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
          else if (n->type == NODE_TYPE_CHAR)
!             elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
          else if (n->type == NODE_TYPE_END)
          {
              elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
--- 1355,1362 ----
              elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
                   a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
          else if (n->type == NODE_TYPE_CHAR)
!             elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
!                  a, n->character);
          else if (n->type == NODE_TYPE_END)
          {
              elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
*************** asc_toupper_z(const char *buff)
*** 2008,2015 ****
      do { \
          if (S_THth(_suf)) \
          { \
!             if (*(ptr)) (ptr)++; \
!             if (*(ptr)) (ptr)++; \
          } \
      } while (0)

--- 2015,2022 ----
      do { \
          if (S_THth(_suf)) \
          { \
!             if (*(ptr)) (ptr) += pg_mblen(ptr); \
!             if (*(ptr)) (ptr) += pg_mblen(ptr); \
          } \
      } while (0)

*************** is_next_separator(FormatNode *n)
*** 2076,2082 ****

          return true;
      }
!     else if (isdigit((unsigned char) n->character))
          return false;

      return true;                /* some non-digit input (separator) */
--- 2083,2090 ----

          return true;
      }
!     else if (n->character[1] == '\0' &&
!              isdigit((unsigned char) n->character[0]))
          return false;

      return true;                /* some non-digit input (separator) */
*************** DCH_to_char(FormatNode *node, bool is_in
*** 2405,2412 ****
      {
          if (n->type != NODE_TYPE_ACTION)
          {
!             *s = n->character;
!             s++;
              continue;
          }

--- 2413,2420 ----
      {
          if (n->type != NODE_TYPE_ACTION)
          {
!             strcpy(s, n->character);
!             s += strlen(s);
              continue;
          }

*************** DCH_from_char(FormatNode *node, char *in
*** 2974,2980 ****
               * we don't insist that the consumed character match the format's
               * character.
               */
!             s++;
              continue;
          }

--- 2982,2988 ----
               * we don't insist that the consumed character match the format's
               * character.
               */
!             s += pg_mblen(s);
              continue;
          }

*************** get_last_relevant_decnum(char *num)
*** 4217,4223 ****
  /*
   * These macros are used in NUM_processor() and its subsidiary routines.
   * OVERLOAD_TEST: true if we've reached end of input string
!  * AMOUNT_TEST(s): true if at least s characters remain in string
   */
  #define OVERLOAD_TEST    (Np->inout_p >= Np->inout + input_len)
  #define AMOUNT_TEST(s)    (Np->inout_p <= Np->inout + (input_len - (s)))
--- 4225,4231 ----
  /*
   * These macros are used in NUM_processor() and its subsidiary routines.
   * OVERLOAD_TEST: true if we've reached end of input string
!  * AMOUNT_TEST(s): true if at least s bytes remain in string
   */
  #define OVERLOAD_TEST    (Np->inout_p >= Np->inout + input_len)
  #define AMOUNT_TEST(s)    (Np->inout_p <= Np->inout + (input_len - (s)))
*************** NUM_processor(FormatNode *node, NUMDesc
*** 4821,4829 ****
          if (!Np->is_to_char)
          {
              /*
!              * Check at least one character remains to be scanned.  (In
!              * actions below, must use AMOUNT_TEST if we want to read more
!              * characters than that.)
               */
              if (OVERLOAD_TEST)
                  break;
--- 4829,4837 ----
          if (!Np->is_to_char)
          {
              /*
!              * Check at least one byte remains to be scanned.  (In actions
!              * below, must use AMOUNT_TEST if we want to read more bytes than
!              * that.)
               */
              if (OVERLOAD_TEST)
                  break;
*************** NUM_processor(FormatNode *node, NUMDesc
*** 5081,5092 ****
               * In TO_CHAR, non-pattern characters in the format are copied to
               * the output.  In TO_NUMBER, we skip one input character for each
               * non-pattern format character, whether or not it matches the
!              * format character.  (Currently, that's actually implemented as
!              * skipping one input byte per non-pattern format byte, which is
!              * wrong...)
               */
              if (Np->is_to_char)
!                 *Np->inout_p = n->character;
          }
          Np->inout_p++;
      }
--- 5089,5106 ----
               * In TO_CHAR, non-pattern characters in the format are copied to
               * the output.  In TO_NUMBER, we skip one input character for each
               * non-pattern format character, whether or not it matches the
!              * format character.
               */
              if (Np->is_to_char)
!             {
!                 strcpy(Np->inout_p, n->character);
!                 Np->inout_p += strlen(Np->inout_p);
!             }
!             else
!             {
!                 Np->inout_p += pg_mblen(Np->inout_p);
!             }
!             continue;
          }
          Np->inout_p++;
      }

pgsql-hackers by date:

Previous
From: Peter Geoghegan
Date:
Subject: Re: [HACKERS] Parallel Hash take II
Next
From: Jeremy Schneider
Date:
Subject: Re: [HACKERS] pg_upgrade to clusters with a different WAL segment size