Home > mailing lists
Re: CSV multiline final fix - Mailing list pgsql-patches

From	Andrew Dunstan
Subject	Re: CSV multiline final fix
Date	February 22, 2005 04:56:44
Msg-id	421ABB50.6090006@dunslane.net Whole thread Raw
In response to	Re: CSV multiline final fix ("Andrew Dunstan" <andrew@dunslane.net>)
Responses	Re: CSV multiline final fix
List	pgsql-patches
Tree view

Andrew Dunstan wrote:

>Bruce Momjian said:
>
>
>>Shame we had to duplicate CopyReadLine() in a sense.
>>
>>
>>
>>
>
>
>If you can find a clean way to merge them please do - I'll be very grateful.
> My head started to spin when I tried. In general I dislike having more than
>2 or 2 levels of logic in a given piece of code.
>
>
>
>

Previous comment courtesy clumsy fingers and the Department of
Redundancy Department (of course, I meant 2 or 3).

Anyway, please review this patch for copy.c - it's possibly more to your
taste. It's less redundant, but I'm not sure it's more clear.

cheers

andrew
*** copy.c.orig    Mon Feb 21 23:12:41 2005
--- copy.c    Mon Feb 21 23:35:22 2005
***************
*** 98,104 ****
  static EolType eol_type;        /* EOL type of input */
  static int    client_encoding;    /* remote side's character encoding */
  static int    server_encoding;    /* local encoding */
- static bool embedded_line_warning;

  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;        /* is it a binary copy? */
--- 98,103 ----
***************
*** 139,145 ****
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
           List *force_notnull_atts);
! static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
--- 138,144 ----
  static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids,
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
           List *force_notnull_atts);
! static bool CopyReadLine(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
      attr = tupDesc->attrs;
      num_phys_attrs = tupDesc->natts;
      attr_count = list_length(attnumlist);
-     embedded_line_warning = false;

      /*
       * Get info about the columns we need to process.
--- 1190,1195 ----
***************
*** 1718,1724 ****
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = CopyReadLine();

              /*
               * EOF at start of line means we're done.  If we see EOF after
--- 1716,1723 ----
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = csv_mode ?
!                 CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL);

              /*
               * EOF at start of line means we're done.  If we see EOF after
***************
*** 2006,2012 ****
   * by newline.
   */
  static bool
! CopyReadLine(void)
  {
      bool        result;
      bool        change_encoding = (client_encoding != server_encoding);
--- 2005,2011 ----
   * by newline.
   */
  static bool
! CopyReadLine(char * quote, char * escape)
  {
      bool        result;
      bool        change_encoding = (client_encoding != server_encoding);
***************
*** 2015,2020 ****
--- 2014,2032 ----
      int            j;
      unsigned char s[2];
      char       *cvt;
+     bool        in_quote = false, last_was_esc = false, csv_mode = false;
+     char        quotec = '\0', escapec = '\0';
+
+     if (quote)
+     {
+         csv_mode = true;
+         quotec = quote[0];
+         escapec = escape[0];
+         /* ignore special escape processing if it's the same as quotec */
+         if (quotec == escapec)
+             escapec = '\0';
+     }
+

      s[1] = 0;

***************
*** 2031,2041 ****

      /*
       * In this loop we only care for detecting newlines (\r and/or \n) and
!      * the end-of-copy marker (\.).  For backwards compatibility we allow
       * backslashes to escape newline characters.  Backslashes other than
       * the end marker get put into the line_buf, since CopyReadAttribute
!      * does its own escape processing.    These four characters, and only
!      * these four, are assumed the same in frontend and backend encodings.
       * We do not assume that second and later bytes of a frontend
       * multibyte character couldn't look like ASCII characters.
       */
--- 2043,2062 ----

      /*
       * In this loop we only care for detecting newlines (\r and/or \n) and
!      * the end-of-copy marker (\.).
!      *
!      * In Text mode, for backwards compatibility we allow
       * backslashes to escape newline characters.  Backslashes other than
       * the end marker get put into the line_buf, since CopyReadAttribute
!      * does its own escape processing.
!      *
!      * In CSV mode, CR and NL inside a quoted field are just part of the
!      * data value and are put in line_buf. We keep just enough state
!      * to know if we are currently in a quoted field or not.
!      *
!      * These four characters, and only these four, are assumed the same in
!      * frontend and backend encodings.
!      *
       * We do not assume that second and later bytes of a frontend
       * multibyte character couldn't look like ASCII characters.
       */
***************
*** 2047,2059 ****
              result = true;
              break;
          }
!         if (c == '\r')
          {
              if (eol_type == EOL_NL)
!                 ereport(ERROR,
!                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                          errmsg("literal carriage return found in data"),
!                   errhint("Use \"\\r\" to represent carriage return.")));
              /* Check for \r\n on first line, _and_ handle \r\n. */
              if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
              {
--- 2068,2116 ----
              result = true;
              break;
          }
!
!         if (csv_mode)
!         {
!             /*
!              * Dealing with quotes and escapes here is mildly tricky. If the
!              * quote char is also the escape char, there's no problem - we
!              * just use the char as a toggle. If they are different, we need
!              * to ensure that we only take account of an escape inside a quoted
!              * field and immediately preceding a quote char, and not the
!              * second in an escape-escape sequence.
!              */
!
!             if (in_quote && c == escapec)
!                 last_was_esc = ! last_was_esc;
!             if (c == quotec && ! last_was_esc)
!                 in_quote = ! in_quote;
!             if (c != escapec)
!                 last_was_esc = false;
!
!             /*
!              * updating the line count for embedded CR and/or LF chars is
!              * necessarily a little fragile - this test is probably about
!              * the best we can do.
!              */
!             if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
!                 copy_lineno++;
!         }
!
!         if (!in_quote && c == '\r')
          {
              if (eol_type == EOL_NL)
!             {
!                 if (! csv_mode)
!                     ereport(ERROR,
!                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                              errmsg("literal carriage return found in data"),
!                              errhint("Use \"\\r\" to represent carriage return.")));
!                 else
!                     ereport(ERROR,
!                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                              errmsg("unquoted carriage return found in CSV data"),
!                              errhint("Use quoted CSV field to represent carriage return.")));
!             }
              /* Check for \r\n on first line, _and_ handle \r\n. */
              if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
              {
***************
*** 2068,2077 ****
                  {
                      /* found \r, but no \n */
                      if (eol_type == EOL_CRNL)
!                         ereport(ERROR,
!                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                          errmsg("literal carriage return found in data"),
!                                  errhint("Use \"\\r\" to represent carriage return.")));

                      /*
                       * if we got here, it is the first line and we didn't
--- 2125,2143 ----
                  {
                      /* found \r, but no \n */
                      if (eol_type == EOL_CRNL)
!                     {
!                         if (!csv_mode)
!                             ereport(ERROR,
!                                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                                      errmsg("literal carriage return found in data"),
!                                      errhint("Use \"\\r\" to represent carriage return.")));
!                         else
!                             ereport(ERROR,
!                                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                                      errmsg("unquoted carriage return found in data"),
!                                      errhint("Use quoted CSV field to represent carriage return.")));
!
!                     }

                      /*
                       * if we got here, it is the first line and we didn't
***************
*** 2083,2108 ****
              }
              break;
          }
!         if (c == '\n')
          {
              if (eol_type == EOL_CR || eol_type == EOL_CRNL)
!                 ereport(ERROR,
!                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                          errmsg("literal newline found in data"),
!                          errhint("Use \"\\n\" to represent newline.")));
              eol_type = EOL_NL;
              break;
          }
!         if (c == '\\')
          {
!             c = CopyGetChar();
!             if (c == EOF)
              {
                  result = true;
                  break;
              }
!             if (c == '.')
              {
                  if (eol_type == EOL_CRNL)
                  {
                      c = CopyGetChar();
--- 2149,2195 ----
              }
              break;
          }
!         if (!in_quote && c == '\n')
          {
              if (eol_type == EOL_CR || eol_type == EOL_CRNL)
!             {
!                 if (!csv_mode)
!                     ereport(ERROR,
!                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                              errmsg("literal newline found in data"),
!                              errhint("Use \"\\n\" to represent newline.")));
!                 else
!                     ereport(ERROR,
!                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                              errmsg("unquoted newline found in data"),
!                              errhint("Use quoted CSV field to represent newline.")));
!
!             }
              eol_type = EOL_NL;
              break;
          }
!
!         if ((line_buf.len == 0 || !csv_mode) && c == '\\')
          {
!             int c2;
!
!             if (csv_mode)
!                 c2 = CopyPeekChar();
!             else
!                 c2 = c = CopyGetChar();
!
!             if (c2 == EOF)
              {
                  result = true;
+                 if (csv_mode)
+                     CopyDonePeek(c2, true);
                  break;
              }
!             if (c2 == '.')
              {
+                 if (csv_mode)
+                     CopyDonePeek(c2, true); /* allow keep calling GetChar() */
+
                  if (eol_type == EOL_CRNL)
                  {
                      c = CopyGetChar();
***************
*** 2140,2147 ****
                  result = true;    /* report EOF */
                  break;
              }
!             /* not EOF mark, so emit \ and following char literally */
!             appendStringInfoCharMacro(&line_buf, '\\');
          }

          appendStringInfoCharMacro(&line_buf, c);
--- 2227,2238 ----
                  result = true;    /* report EOF */
                  break;
              }
!
!             if (csv_mode)
!                 CopyDonePeek(c2, false); /* not a dot, so put it back */
!             else
!                 /* not EOF mark, so emit \ and following char literally */
!                 appendStringInfoCharMacro(&line_buf, '\\');
          }

          appendStringInfoCharMacro(&line_buf, c);
***************
*** 2369,2402 ****

      for (;;)
      {
-         /* handle multiline quoted fields */
-         if (in_quote && line_buf.cursor >= line_buf.len)
-         {
-             bool        done;
-
-             switch (eol_type)
-             {
-                 case EOL_NL:
-                     appendStringInfoString(&attribute_buf, "\n");
-                     break;
-                 case EOL_CR:
-                     appendStringInfoString(&attribute_buf, "\r");
-                     break;
-                 case EOL_CRNL:
-                     appendStringInfoString(&attribute_buf, "\r\n");
-                     break;
-                 case EOL_UNKNOWN:
-                     /* shouldn't happen - just keep going */
-                     break;
-             }
-
-             copy_lineno++;
-             done = CopyReadLine();
-             if (done && line_buf.len == 0)
-                 break;
-             start_cursor = line_buf.cursor;
-         }
-
          end_cursor = line_buf.cursor;
          if (line_buf.cursor >= line_buf.len)
              break;
--- 2460,2465 ----
***************
*** 2629,2653 ****
           !use_quote && (c = *test_string) != '\0';
           test_string += mblen)
      {
-         /*
-          * We don't know here what the surrounding line end characters
-          * might be. It might not even be under postgres' control. So
-          * we simple warn on ANY embedded line ending character.
-          *
-          * This warning will disappear when we make line parsing field-aware,
-          * so that we can reliably read in embedded line ending characters
-          * regardless of the file's line-end context.
-          *
-          */
-
-         if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-         {
-             embedded_line_warning = true;
-             elog(WARNING,
-                  "CSV fields with embedded linefeed or carriage return "
-                  "characters might not be able to be reimported");
-         }
-
          if (c == delimc || c == quotec || c == '\n' || c == '\r')
              use_quote = true;
          if (!same_encoding)
--- 2692,2697 ----
pgsql-patches by date:
From: Bruce Momjian
Date: 22 February 2005, 04:11:20
Subject: Re: Cleanup for gettext() calls
From: Bruce Momjian
Date: 22 February 2005, 04:59:09
Subject: Re: CSV multiline final fix
Re: CSV multiline final fix - Mailing list pgsql-patches

Previous

Next