Thread: fix CSV multiline parsing - proof of concept

fix CSV multiline parsing - proof of concept

From
Andrew Dunstan
Date:
06 February 2005, 19:16:08
Attached is a proof-of-concept patch (i.e. not intended for application
just yet) to fix the problem of parsing CSV multiline fields.

Originally I indicated that the way to solve this IMHO was to the
combine reading and parsing phases of COPY for CSV. However, there's a
lot going on there and I adopted a somewhat less invasive approach,
which detects if a CR and/orNL should be part of a data value and if so
treats it as just another character. Also, it removes the escaping
nature of backslash for NL and CR in CSV, which is clearly a bug.

One thing I noticed is that (unless I misread the code) our standard
detection of the end marker  \.<EOL> doesn't seem to require that it be
at the beginning of a line, as the docs say it should. I didn't change
that but did build a test for it into the special CSV code.

comments welcome.

cheers

andrew
*** copy.c.orig    Wed May 26 00:41:10 2004
--- copy.c    Sun Feb  6 10:36:29 2005
***************
*** 137,142 ****
--- 137,143 ----
           char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
           List *force_notnull_atts);
  static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                                 CopyReadResult *result, bool *isnull);
  static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1683,1689 ****
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = CopyReadLine();

              /*
               * EOF at start of line means we're done.  If we see EOF
--- 1684,1690 ----
              ListCell   *cur;

              /* Actually read the line into memory here */
!             done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine();

              /*
               * EOF at start of line means we're done.  If we see EOF
***************
*** 2159,2164 ****
--- 2160,2398 ----
      return result;
  }

+ /*
+  * Read a line for CSV copy mode. Differences from standard mode:
+  * . CR an NL are not special inside quoted fields - they just get added
+  *   to the buffer.
+  * . \ is not magical except as the start of the end of data marker.
+  *
+  */
+
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+     bool        result;
+     bool        change_encoding = (client_encoding != server_encoding);
+     int            c;
+     int            mblen;
+     int            j;
+     unsigned char s[2];
+     char       *cvt;
+     bool        in_quote = false;
+     char        quotec = quote[0];
+     char        escapec = escape[0];
+
+     s[1] = 0;
+
+     /* reset line_buf to empty */
+     line_buf.len = 0;
+     line_buf.data[0] = '\0';
+     line_buf.cursor = 0;
+
+     /* mark that encoding conversion hasn't occurred yet */
+     line_buf_converted = false;
+
+     /* set default status */
+     result = false;
+
+     /*
+      * In this loop we only care for detecting newlines (\r and/or \n)
+      * and the end-of-copy marker (\.).  These four
+      * characters, and only these four, are assumed the same in frontend
+      * and backend encodings.  We do not assume that second and later bytes
+      * of a frontend multibyte character couldn't look like ASCII characters.
+      *
+      * What about the encoding implications of the quote / excape chars?
+      *
+      * However, CR and NL characters that are inside a quoted field are
+      * not special, and are simply a part of the data value. The rough
+      * and ready rule we use to recognise this situation is to count
+      * the number of quote and escape characters on the line - if it's odd
+      * we are inside a quoted field (so these chars act like a toggle).
+      */
+     for (;;)
+     {
+         c = CopyGetChar();
+         if (c == EOF)
+         {
+             result = true;
+             break;
+         }
+
+         /* not a complete parsing rule but good enough for here */
+         if (c == quotec || c == escapec)
+             in_quote = ! in_quote;
+
+         /*
+          * updating the line count for embedded CR and/or LF chars is
+          * necessarily a little fragile - this test is probably about
+          * the best we can do.
+          */
+         if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+             copy_lineno++;
+
+         if (!in_quote && c == '\r')
+         {
+             if (eol_type == EOL_NL)
+                 ereport(ERROR,
+                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted carriage return found in CSV data"),
+                   errhint("Use quoted CSV field to represent carriage return.")));
+             /* Check for \r\n on first line, _and_ handle \r\n. */
+             if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+             {
+                 int            c2 = CopyPeekChar();
+
+                 if (c2 == '\n')
+                 {
+                     CopyDonePeek(c2, true);        /* eat newline */
+                     eol_type = EOL_CRNL;
+                 }
+                 else
+                 {
+                     /* found \r, but no \n */
+                     if (eol_type == EOL_CRNL)
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted carriage return found in CSV data"),
+                                  errhint("Use quoted CSV field to represent carriage return.")));
+
+                     /*
+                      * if we got here, it is the first line and we didn't
+                      * get \n, so put it back
+                      */
+                     CopyDonePeek(c2, false);
+                     eol_type = EOL_CR;
+                 }
+             }
+             break;
+         }
+         if (!in_quote && c == '\n')
+         {
+             if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+                 ereport(ERROR,
+                         (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                          errmsg("unquoted newline found in CSV data"),
+                          errhint("Use quoted CSV field to represent newline.")));
+             eol_type = EOL_NL;
+             break;
+         }
+
+         /* \ is only potentially magical at the start of a line */
+         if (line_buf.len == 0 && c == '\\')
+         {
+             int            c2 = CopyPeekChar();
+
+             if (c2 == EOF)
+             {
+                 result = true;
+
+                 CopyDonePeek(c2, true); /* eat it - do we need to? */
+
+                 break;
+             }
+             if (c2 == '.')
+             {
+
+                 CopyDonePeek(c2, true); /* so we can keep calling GetChar() */
+
+                 if (eol_type == EOL_CRNL)
+                 {
+                     c = CopyGetChar();
+                     if (c == '\n')
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                  errmsg("end-of-copy marker does not match previous newline style")));
+                     if (c != '\r')
+                         ereport(ERROR,
+                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                                  errmsg("end-of-copy marker corrupt")));
+                 }
+                 c = CopyGetChar();
+                 if (c != '\r' && c != '\n')
+                     ereport(ERROR,
+                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                              errmsg("end-of-copy marker corrupt")));
+                 if ((eol_type == EOL_NL && c != '\n') ||
+                     (eol_type == EOL_CRNL && c != '\n') ||
+                     (eol_type == EOL_CR && c != '\r'))
+                     ereport(ERROR,
+                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+                              errmsg("end-of-copy marker does not match previous newline style")));
+
+                 /*
+                  * In protocol version 3, we should ignore anything
+                  * after \. up to the protocol end of copy data.  (XXX
+                  * maybe better not to treat \. as special?)
+                  */
+                 if (copy_dest == COPY_NEW_FE)
+                 {
+                     while (c != EOF)
+                         c = CopyGetChar();
+                 }
+                 result = true;    /* report EOF */
+                 break;
+             }
+
+             CopyDonePeek(c2, false); /* not a dot, so put it back */
+
+         }
+
+         appendStringInfoCharMacro(&line_buf, c);
+
+         /*
+          * When client encoding != server, must be careful to read the
+          * extra bytes of a multibyte character exactly, since the encoding
+          * might not ensure they don't look like ASCII.  When the encodings
+          * are the same, we need not do this, since no server encoding we
+          * use has ASCII-like following bytes.
+          */
+         if (change_encoding)
+         {
+             s[0] = c;
+             mblen = pg_encoding_mblen(client_encoding, s);
+             for (j = 1; j < mblen; j++)
+             {
+                 c = CopyGetChar();
+                 if (c == EOF)
+                 {
+                     result = true;
+                     break;
+                 }
+                 appendStringInfoCharMacro(&line_buf, c);
+             }
+             if (result)
+                 break;            /* out of outer loop */
+         }
+     } /* end of outer loop */
+
+     /*
+      * Done reading the line.  Convert it to server encoding.
+      *
+      * Note: set line_buf_converted to true *before* attempting conversion;
+      * this prevents infinite recursion during error reporting should
+      * pg_client_to_server() issue an error, due to copy_in_error_callback
+      * again attempting the same conversion.  We'll end up issuing the message
+      * without conversion, which is bad but better than nothing ...
+      */
+     line_buf_converted = true;
+
+     if (change_encoding)
+     {
+         cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data,
+                                            line_buf.len);
+         if (cvt != line_buf.data)
+         {
+             /* transfer converted data back to line_buf */
+             line_buf.len = 0;
+             line_buf.data[0] = '\0';
+             appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+         }
+     }
+
+     return result;
+ }
+
  /*----------
   * Read the value of a single attribute, performing de-escaping as needed.
   *
***************
*** 2332,2366 ****

      for (;;)
      {
-         /* handle multiline quoted fields */
-         if (in_quote && line_buf.cursor >= line_buf.len)
-         {
-             bool done;
-
-             switch(eol_type)
-             {
-                 case EOL_NL:
-                     appendStringInfoString(&attribute_buf,"\n");
-                     break;
-                 case EOL_CR:
-                     appendStringInfoString(&attribute_buf,"\r");
-                     break;
-                 case EOL_CRNL:
-                     appendStringInfoString(&attribute_buf,"\r\n");
-                     break;
-                 case EOL_UNKNOWN:
-                     /* shouldn't happen - just keep going */
-                     break;
-             }
-
-             copy_lineno++;
-             done = CopyReadLine();
-             if (done && line_buf.len == 0)
-                 break;
-             start_cursor = line_buf.cursor;
-         }
-
          end_cursor = line_buf.cursor;
          if (line_buf.cursor >= line_buf.len)
              break;
          c = line_buf.data[line_buf.cursor++];
--- 2566,2573 ----

      for (;;)
      {
          end_cursor = line_buf.cursor;
+
          if (line_buf.cursor >= line_buf.len)
              break;
          c = line_buf.data[line_buf.cursor++];