CSV multiline final fix - Mailing list pgsql-patches

From Andrew Dunstan
Subject CSV multiline final fix
Date
Msg-id 4219326D.5060801@dunslane.net
Whole thread Raw
Responses Re: CSV multiline final fix
List pgsql-patches
Well, in response to the huge number (0) of comments on my POC patch to
fix this, I prepared the attached patch, which improves on my previous
effort a bit (there was one obscure failure case which is now handled).

Basically, all the required logic is in a new function CopyReadLineCSV()
which is almost but not quite like CopyReadLine(). The new function
keeps just enough state to know if a line ending sequence (CR, CRLF, or
LF) is part of a quoted field or not. This gets rid of the need for
special casing embedded line endings on input elsewhere, so that is
removed, as is the warning about them on output that we added back in
december (as we then thought just before release). Lastly, the docs are
also patched.

Also attached is my tiny test file - maybe we need to cover this in
regression tests?

cheers

andrew
diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml ./doc/src/sgml/ref/copy.sgml
*** ../../pgbf/root/REL8_0_STABLE/pgsql/doc/src/sgml/ref/copy.sgml    Mon Jan  3 19:39:53 2005
--- ./doc/src/sgml/ref/copy.sgml    Sun Feb 20 19:18:54 2005
***************
*** 496,508 ****
      <para>
       CSV mode will both recognize and produce CSV files with quoted
       values containing embedded carriage returns and line feeds. Thus
!      the files are not strictly one line per table row like text-mode
!      files. However, <productname>PostgreSQL</productname> will reject
!      <command>COPY</command> input if any fields contain embedded line
!      end character sequences that do not match the line ending
!      convention used in the CSV file itself. It is generally safer to
!      import data containing embedded line end characters using the
!      text or binary formats rather than CSV.
      </para>
     </note>

--- 496,503 ----
      <para>
       CSV mode will both recognize and produce CSV files with quoted
       values containing embedded carriage returns and line feeds. Thus
!      the files are not strictly one line per table row as are text-mode
!      files.
      </para>
     </note>

***************
*** 513,518 ****
--- 508,515 ----
       might encounter some files that cannot be imported using this
       mechanism, and <command>COPY</> might produce files that other
       programs cannot process.
+      It is generally safer to import data using the text or binary formats,
+      if possible, rather than using CSV format.
      </para>
     </note>

diff -c -r ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c ./src/backend/commands/copy.c
*** ../../pgbf/root/REL8_0_STABLE/pgsql/src/backend/commands/copy.c    Fri Dec 31 16:59:41 2004
--- ./src/backend/commands/copy.c    Sun Feb 20 13:40:56 2005
***************
*** 98,104 ****
  static EolType eol_type;        /* EOL type of input */
  static int    client_encoding;    /* remote side's character encoding */
  static int    server_encoding;    /* local encoding */
- static bool embedded_line_warning;

  /* these are just for error messages, see copy_in_error_callback */
  static bool copy_binary;        /* is it a binary copy? */
--- 98,103 ----
***************
*** 140,145 ****
--- 139,145 ----
   char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
           List *force_notnull_atts);
  static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                    CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1191,1197 ****
      attr = tupDesc->attrs;
      num_phys_attrs = tupDesc->natts;
      attr_count = list_length(attnumlist);
-     embedded_line_warning = false;

      /*
       * Get info about the columns we need to process.
--- 1191,1196 ----
***************
*** 1718,1724 ****
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = CopyReadLine();

              /*
               * EOF at start of line means we're done.  If we see EOF after
--- 1717,1723 ----
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine();

              /*
               * EOF at start of line means we're done.  If we see EOF after
***************
*** 2194,2199 ****
--- 2193,2448 ----
      return result;
  }

+ /*
+  * Read a line for CSV copy mode. Differences from standard mode:
+  * . CR an NL are not special inside quoted fields - they just get added
+  *   to the buffer.
+  * . \ is not magical except as the start of the end of data marker.
+  *
+  */
+
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+     bool        result;
+     bool        change_encoding = (client_encoding != server_encoding);
+     int            c;
+     int            mblen;
+     int            j;
+     unsigned char s[2];
+     char       *cvt;
+     bool        in_quote = false, last_was_esc = false;
+     char        quotec = quote[0];
+     char        escapec = escape[0];
+
+     s[1] = 0;
+
+     /* ignore special escape processing if it's the same as quote */
+     if (quotec == escapec)
+         escapec = '\0';
+
+     /* reset line_buf to empty */
+     line_buf.len = 0;
+     line_buf.data[0] = '\0';
+     line_buf.cursor = 0;
+
+     /* mark that encoding conversion hasn't occurred yet */
+     line_buf_converted = false;
+
+     /* set default status */
+     result = false;
+
+     /*
+      * In this loop we only care for detecting newlines (\r and/or \n)
+      * and the end-of-copy marker (\.).  These four
+      * characters, and only these four, are assumed the same in frontend
+      * and backend encodings.  We do not assume that second and later bytes
+      * of a frontend multibyte character couldn't look like ASCII characters.
+      *
+      * What about the encoding implications of the quote / excape chars?
+      *
+      * However, CR and NL characters that are inside a quoted field are
+      * not special, and are simply a part of the data value. The parsing rule
+      * used is a bit rough and ready, but probably adequate for our purposes.
+      */
+
+     for (;;)
+     {
+         c = CopyGetChar();
+         if (c == EOF)
+         {
+             result = true;
+             break;
+         }
+
+         /*
+          * Dealing with quotes and escapes here is mildly tricky. If the
+          * quote char is also the escape char, there's no problem - we
+          * just use the char as a toggle. If they are different, we need
+          * to ensure that we only take account of an escape inside a quoted
+          * field and immediately preceding a quote char, and not the
+          * second in a escape-escape sequence.
+          */
+
+         if (in_quote && c == escapec)
+             last_was_esc = ! last_was_esc;
+
+         if (c == quotec && ! last_was_esc)
+             in_quote = ! in_quote;
+
+         if (c != escapec)
+             last_was_esc = false;
+
+         /*
+          * updating the line count for embedded CR and/or LF chars is
+          * necessarily a little fragile - this test is probably about
+          * the best we can do.
+          */
+         if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+             copy_lineno++;
+
+         if (!in_quote && c == '\r')
+         {
+             if (eol_type == EOL_NL)
+                 ereport(ERROR,
+                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted carriage return found in CSV data"),
+                   errhint("Use quoted CSV field to represent carriage return.")));
+             /* Check for \r\n on first line, _and_ handle \r\n. */
+             if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+             {
+                 int            c2 = CopyPeekChar();
+
+                 if (c2 == '\n')
+                 {
+                     CopyDonePeek(c2, true);        /* eat newline */
+                     eol_type = EOL_CRNL;
+                 }
+                 else
+                 {
+                     /* found \r, but no \n */
+                     if (eol_type == EOL_CRNL)
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted carriage return found in CSV data"),
+                                  errhint("Use quoted CSV field to represent carriage return.")));
+
+                     /*
+                      * if we got here, it is the first line and we didn't
+                      * get \n, so put it back
+                      */
+                     CopyDonePeek(c2, false);
+                     eol_type = EOL_CR;
+                 }
+             }
+             break;
+         }
+         if (!in_quote && c == '\n')
+         {
+             if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+                 ereport(ERROR,
+                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted newline found in CSV data"),
+                          errhint("Use quoted CSV field to represent newline.")));
+             eol_type = EOL_NL;
+             break;
+         }
+
+         /* \ is only potentially magical at the start of a line */
+         if (line_buf.len == 0 && c == '\\')
+         {
+             int            c2 = CopyPeekChar();
+
+             if (c2 == EOF)
+             {
+                 result = true;
+
+                 CopyDonePeek(c2, true); /* eat it - do we need to? */
+
+                 break;
+             }
+             if (c2 == '.')
+             {
+
+                 CopyDonePeek(c2, true); /* so we can keep calling GetChar() */
+
+                 if (eol_type == EOL_CRNL)
+                 {
+                     c = CopyGetChar();
+                     if (c == '\n')
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                  errmsg("end-of-copy marker does not match previous newline style")));
+                     if (c != '\r')
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                  errmsg("end-of-copy marker corrupt")));
+                 }
+                 c = CopyGetChar();
+                 if (c != '\r' && c != '\n')
+                     ereport(ERROR,
+                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                              errmsg("end-of-copy marker corrupt")));
+                 if ((eol_type == EOL_NL && c != '\n') ||
+                     (eol_type == EOL_CRNL && c != '\n') ||
+                     (eol_type == EOL_CR && c != '\r'))
+                     ereport(ERROR,
+                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                              errmsg("end-of-copy marker does not match previous newline style")));
+
+                 /*
+                  * In protocol version 3, we should ignore anything
+                  * after \. up to the protocol end of copy data.  (XXX
+                  * maybe better not to treat \. as special?)
+                  */
+                 if (copy_dest == COPY_NEW_FE)
+                 {
+                     while (c != EOF)
+                         c = CopyGetChar();
+                 }
+                 result = true;    /* report EOF */
+                 break;
+             }
+
+             CopyDonePeek(c2, false); /* not a dot, so put it back */
+
+         }
+
+         appendStringInfoCharMacro(&line_buf, c);
+
+         /*
+          * When client encoding != server, must be careful to read the
+          * extra bytes of a multibyte character exactly, since the encoding
+          * might not ensure they don't look like ASCII.  When the encodings
+          * are the same, we need not do this, since no server encoding we
+          * use has ASCII-like following bytes.
+          */
+         if (change_encoding)
+         {
+             s[0] = c;
+             mblen = pg_encoding_mblen(client_encoding, s);
+             for (j = 1; j < mblen; j++)
+             {
+                 c = CopyGetChar();
+                 if (c == EOF)
+                 {
+                     result = true;
+                     break;
+                 }
+                 appendStringInfoCharMacro(&line_buf, c);
+             }
+             if (result)
+                 break;            /* out of outer loop */
+         }
+     } /* end of outer loop */
+
+     /*
+      * Done reading the line.  Convert it to server encoding.
+      *
+      * Note: set line_buf_converted to true *before* attempting conversion;
+      * this prevents infinite recursion during error reporting should
+      * pg_client_to_server() issue an error, due to copy_in_error_callback
+      * again attempting the same conversion.  We'll end up issuing the message
+      * without conversion, which is bad but better than nothing ...
+      */
+     line_buf_converted = true;
+
+     if (change_encoding)
+     {
+         cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data,
+                                            line_buf.len);
+         if (cvt != line_buf.data)
+         {
+             /* transfer converted data back to line_buf */
+             line_buf.len = 0;
+             line_buf.data[0] = '\0';
+             appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+         }
+     }
+
+     return result;
+ }
+
  /*----------
   * Read the value of a single attribute, performing de-escaping as needed.
   *
***************
*** 2369,2402 ****

      for (;;)
      {
-         /* handle multiline quoted fields */
-         if (in_quote && line_buf.cursor >= line_buf.len)
-         {
-             bool        done;
-
-             switch (eol_type)
-             {
-                 case EOL_NL:
-                     appendStringInfoString(&attribute_buf, "\n");
-                     break;
-                 case EOL_CR:
-                     appendStringInfoString(&attribute_buf, "\r");
-                     break;
-                 case EOL_CRNL:
-                     appendStringInfoString(&attribute_buf, "\r\n");
-                     break;
-                 case EOL_UNKNOWN:
-                     /* shouldn't happen - just keep going */
-                     break;
-             }
-
-             copy_lineno++;
-             done = CopyReadLine();
-             if (done && line_buf.len == 0)
-                 break;
-             start_cursor = line_buf.cursor;
-         }
-
          end_cursor = line_buf.cursor;
          if (line_buf.cursor >= line_buf.len)
              break;
--- 2618,2623 ----
***************
*** 2629,2653 ****
           !use_quote && (c = *test_string) != '\0';
           test_string += mblen)
      {
-         /*
-          * We don't know here what the surrounding line end characters
-          * might be. It might not even be under postgres' control. So
-          * we simple warn on ANY embedded line ending character.
-          *
-          * This warning will disappear when we make line parsing field-aware,
-          * so that we can reliably read in embedded line ending characters
-          * regardless of the file's line-end context.
-          *
-          */
-
-         if (!embedded_line_warning  && (c == '\n' || c == '\r') )
-         {
-             embedded_line_warning = true;
-             elog(WARNING,
-                  "CSV fields with embedded linefeed or carriage return "
-                  "characters might not be able to be reimported");
-         }
-
          if (c == delimc || c == quotec || c == '\n' || c == '\r')
              use_quote = true;
          if (!same_encoding)
--- 2850,2855 ----


create temp table copytest (
    style    text,
    test     text,
    filler    int);

insert into copytest values('DOS','abc\r\ndef',1);
insert into copytest values('Unix','abc\ndef',2);
insert into copytest values('Mac','abc\rdef',3);
insert into copytest values('esc\\ape','a\\r\\\r\\\n\\nb',4);

copy copytest to '/tmp/copytest.csv' csv;

create temp table copytest2 (like copytest);

copy copytest2 from '/tmp/copytest.csv' csv;

select * from copytest except select * from copytest2;

truncate copytest2;

copy copytest to '/tmp/copytest.csv' csv quote '\'' escape '\\';

copy copytest2 from '/tmp/copytest.csv' csv quote '\'' escape '\\';

select * from copytest except select * from copytest2;

create or replace function nice_quote(text) returns text language plperl as $$

  my $txt = shift;
  foreach ($txt)
  {
    s/\r/\\r/g;
    s/\n/\\n/g;
  };
  return $txt;

$$;

select nice_quote(quote_literal(style)) as style,
       nice_quote(quote_literal(test)) as test
from copytest;




pgsql-patches by date:

Previous
From: Peter Eisentraut
Date:
Subject: Re: Change < to -f in examples with input files
Next
From: Neil Conway
Date:
Subject: Re: pg_ctl reference page