Attached is a proof-of-concept patch (i.e. not intended for application
just yet) to fix the problem of parsing CSV multiline fields.
Originally I indicated that the way to solve this IMHO was to the
combine reading and parsing phases of COPY for CSV. However, there's a
lot going on there and I adopted a somewhat less invasive approach,
which detects if a CR and/orNL should be part of a data value and if so
treats it as just another character. Also, it removes the escaping
nature of backslash for NL and CR in CSV, which is clearly a bug.
One thing I noticed is that (unless I misread the code) our standard
detection of the end marker \.<EOL> doesn't seem to require that it be
at the beginning of a line, as the docs say it should. I didn't change
that but did build a test for it into the special CSV code.
comments welcome.
cheers
andrew
*** copy.c.orig Wed May 26 00:41:10 2004
--- copy.c Sun Feb 6 10:36:29 2005
***************
*** 137,142 ****
--- 137,143 ----
char *delim, char *null_print, bool csv_mode, char *quote, char *escape,
List *force_notnull_atts);
static bool CopyReadLine(void);
+ static bool CopyReadLineCSV(char * quote, char * escape);
static char *CopyReadAttribute(const char *delim, const char *null_print,
CopyReadResult *result, bool *isnull);
static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
***************
*** 1683,1689 ****
ListCell *cur;
/* Actually read the line into memory here */
! done = CopyReadLine();
/*
* EOF at start of line means we're done. If we see EOF
--- 1684,1690 ----
ListCell *cur;
/* Actually read the line into memory here */
! done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine();
/*
* EOF at start of line means we're done. If we see EOF
***************
*** 2159,2164 ****
--- 2160,2398 ----
return result;
}
+ /*
+ * Read a line for CSV copy mode. Differences from standard mode:
+ * . CR an NL are not special inside quoted fields - they just get added
+ * to the buffer.
+ * . \ is not magical except as the start of the end of data marker.
+ *
+ */
+
+ static bool
+ CopyReadLineCSV(char * quote, char * escape)
+ {
+ bool result;
+ bool change_encoding = (client_encoding != server_encoding);
+ int c;
+ int mblen;
+ int j;
+ unsigned char s[2];
+ char *cvt;
+ bool in_quote = false;
+ char quotec = quote[0];
+ char escapec = escape[0];
+
+ s[1] = 0;
+
+ /* reset line_buf to empty */
+ line_buf.len = 0;
+ line_buf.data[0] = '\0';
+ line_buf.cursor = 0;
+
+ /* mark that encoding conversion hasn't occurred yet */
+ line_buf_converted = false;
+
+ /* set default status */
+ result = false;
+
+ /*
+ * In this loop we only care for detecting newlines (\r and/or \n)
+ * and the end-of-copy marker (\.). These four
+ * characters, and only these four, are assumed the same in frontend
+ * and backend encodings. We do not assume that second and later bytes
+ * of a frontend multibyte character couldn't look like ASCII characters.
+ *
+ * What about the encoding implications of the quote / excape chars?
+ *
+ * However, CR and NL characters that are inside a quoted field are
+ * not special, and are simply a part of the data value. The rough
+ * and ready rule we use to recognise this situation is to count
+ * the number of quote and escape characters on the line - if it's odd
+ * we are inside a quoted field (so these chars act like a toggle).
+ */
+ for (;;)
+ {
+ c = CopyGetChar();
+ if (c == EOF)
+ {
+ result = true;
+ break;
+ }
+
+ /* not a complete parsing rule but good enough for here */
+ if (c == quotec || c == escapec)
+ in_quote = ! in_quote;
+
+ /*
+ * updating the line count for embedded CR and/or LF chars is
+ * necessarily a little fragile - this test is probably about
+ * the best we can do.
+ */
+ if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n'))
+ copy_lineno++;
+
+ if (!in_quote && c == '\r')
+ {
+ if (eol_type == EOL_NL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted carriage return found in CSV data"),
+ errhint("Use quoted CSV field to represent carriage return.")));
+ /* Check for \r\n on first line, _and_ handle \r\n. */
+ if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL)
+ {
+ int c2 = CopyPeekChar();
+
+ if (c2 == '\n')
+ {
+ CopyDonePeek(c2, true); /* eat newline */
+ eol_type = EOL_CRNL;
+ }
+ else
+ {
+ /* found \r, but no \n */
+ if (eol_type == EOL_CRNL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted carriage return found in CSV data"),
+ errhint("Use quoted CSV field to represent carriage return.")));
+
+ /*
+ * if we got here, it is the first line and we didn't
+ * get \n, so put it back
+ */
+ CopyDonePeek(c2, false);
+ eol_type = EOL_CR;
+ }
+ }
+ break;
+ }
+ if (!in_quote && c == '\n')
+ {
+ if (eol_type == EOL_CR || eol_type == EOL_CRNL)
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("unquoted newline found in CSV data"),
+ errhint("Use quoted CSV field to represent newline.")));
+ eol_type = EOL_NL;
+ break;
+ }
+
+ /* \ is only potentially magical at the start of a line */
+ if (line_buf.len == 0 && c == '\\')
+ {
+ int c2 = CopyPeekChar();
+
+ if (c2 == EOF)
+ {
+ result = true;
+
+ CopyDonePeek(c2, true); /* eat it - do we need to? */
+
+ break;
+ }
+ if (c2 == '.')
+ {
+
+ CopyDonePeek(c2, true); /* so we can keep calling GetChar() */
+
+ if (eol_type == EOL_CRNL)
+ {
+ c = CopyGetChar();
+ if (c == '\n')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker does not match previous newline style")));
+ if (c != '\r')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ }
+ c = CopyGetChar();
+ if (c != '\r' && c != '\n')
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker corrupt")));
+ if ((eol_type == EOL_NL && c != '\n') ||
+ (eol_type == EOL_CRNL && c != '\n') ||
+ (eol_type == EOL_CR && c != '\r'))
+ ereport(ERROR,
+ (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
+ errmsg("end-of-copy marker does not match previous newline style")));
+
+ /*
+ * In protocol version 3, we should ignore anything
+ * after \. up to the protocol end of copy data. (XXX
+ * maybe better not to treat \. as special?)
+ */
+ if (copy_dest == COPY_NEW_FE)
+ {
+ while (c != EOF)
+ c = CopyGetChar();
+ }
+ result = true; /* report EOF */
+ break;
+ }
+
+ CopyDonePeek(c2, false); /* not a dot, so put it back */
+
+ }
+
+ appendStringInfoCharMacro(&line_buf, c);
+
+ /*
+ * When client encoding != server, must be careful to read the
+ * extra bytes of a multibyte character exactly, since the encoding
+ * might not ensure they don't look like ASCII. When the encodings
+ * are the same, we need not do this, since no server encoding we
+ * use has ASCII-like following bytes.
+ */
+ if (change_encoding)
+ {
+ s[0] = c;
+ mblen = pg_encoding_mblen(client_encoding, s);
+ for (j = 1; j < mblen; j++)
+ {
+ c = CopyGetChar();
+ if (c == EOF)
+ {
+ result = true;
+ break;
+ }
+ appendStringInfoCharMacro(&line_buf, c);
+ }
+ if (result)
+ break; /* out of outer loop */
+ }
+ } /* end of outer loop */
+
+ /*
+ * Done reading the line. Convert it to server encoding.
+ *
+ * Note: set line_buf_converted to true *before* attempting conversion;
+ * this prevents infinite recursion during error reporting should
+ * pg_client_to_server() issue an error, due to copy_in_error_callback
+ * again attempting the same conversion. We'll end up issuing the message
+ * without conversion, which is bad but better than nothing ...
+ */
+ line_buf_converted = true;
+
+ if (change_encoding)
+ {
+ cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data,
+ line_buf.len);
+ if (cvt != line_buf.data)
+ {
+ /* transfer converted data back to line_buf */
+ line_buf.len = 0;
+ line_buf.data[0] = '\0';
+ appendBinaryStringInfo(&line_buf, cvt, strlen(cvt));
+ }
+ }
+
+ return result;
+ }
+
/*----------
* Read the value of a single attribute, performing de-escaping as needed.
*
***************
*** 2332,2366 ****
for (;;)
{
- /* handle multiline quoted fields */
- if (in_quote && line_buf.cursor >= line_buf.len)
- {
- bool done;
-
- switch(eol_type)
- {
- case EOL_NL:
- appendStringInfoString(&attribute_buf,"\n");
- break;
- case EOL_CR:
- appendStringInfoString(&attribute_buf,"\r");
- break;
- case EOL_CRNL:
- appendStringInfoString(&attribute_buf,"\r\n");
- break;
- case EOL_UNKNOWN:
- /* shouldn't happen - just keep going */
- break;
- }
-
- copy_lineno++;
- done = CopyReadLine();
- if (done && line_buf.len == 0)
- break;
- start_cursor = line_buf.cursor;
- }
-
end_cursor = line_buf.cursor;
if (line_buf.cursor >= line_buf.len)
break;
c = line_buf.data[line_buf.cursor++];
--- 2566,2573 ----
for (;;)
{
end_cursor = line_buf.cursor;
+
if (line_buf.cursor >= line_buf.len)
break;
c = line_buf.data[line_buf.cursor++];