fix CSV multiline parsing - proof of concept - Mailing list pgsql-patches
From | Andrew Dunstan |
---|---|
Subject | fix CSV multiline parsing - proof of concept |
Date | |
Msg-id | 420642BC.4000806@dunslane.net Whole thread Raw |
List | pgsql-patches |
Attached is a proof-of-concept patch (i.e. not intended for application just yet) to fix the problem of parsing CSV multiline fields. Originally I indicated that the way to solve this IMHO was to the combine reading and parsing phases of COPY for CSV. However, there's a lot going on there and I adopted a somewhat less invasive approach, which detects if a CR and/orNL should be part of a data value and if so treats it as just another character. Also, it removes the escaping nature of backslash for NL and CR in CSV, which is clearly a bug. One thing I noticed is that (unless I misread the code) our standard detection of the end marker \.<EOL> doesn't seem to require that it be at the beginning of a line, as the docs say it should. I didn't change that but did build a test for it into the special CSV code. comments welcome. cheers andrew *** copy.c.orig Wed May 26 00:41:10 2004 --- copy.c Sun Feb 6 10:36:29 2005 *************** *** 137,142 **** --- 137,143 ---- char *delim, char *null_print, bool csv_mode, char *quote, char *escape, List *force_notnull_atts); static bool CopyReadLine(void); + static bool CopyReadLineCSV(char * quote, char * escape); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); static char *CopyReadAttributeCSV(const char *delim, const char *null_print, *************** *** 1683,1689 **** ListCell *cur; /* Actually read the line into memory here */ ! done = CopyReadLine(); /* * EOF at start of line means we're done. If we see EOF --- 1684,1690 ---- ListCell *cur; /* Actually read the line into memory here */ ! done = csv_mode ? CopyReadLineCSV(quote, escape) : CopyReadLine(); /* * EOF at start of line means we're done. If we see EOF *************** *** 2159,2164 **** --- 2160,2398 ---- return result; } + /* + * Read a line for CSV copy mode. Differences from standard mode: + * . CR an NL are not special inside quoted fields - they just get added + * to the buffer. + * . \ is not magical except as the start of the end of data marker. + * + */ + + static bool + CopyReadLineCSV(char * quote, char * escape) + { + bool result; + bool change_encoding = (client_encoding != server_encoding); + int c; + int mblen; + int j; + unsigned char s[2]; + char *cvt; + bool in_quote = false; + char quotec = quote[0]; + char escapec = escape[0]; + + s[1] = 0; + + /* reset line_buf to empty */ + line_buf.len = 0; + line_buf.data[0] = '\0'; + line_buf.cursor = 0; + + /* mark that encoding conversion hasn't occurred yet */ + line_buf_converted = false; + + /* set default status */ + result = false; + + /* + * In this loop we only care for detecting newlines (\r and/or \n) + * and the end-of-copy marker (\.). These four + * characters, and only these four, are assumed the same in frontend + * and backend encodings. We do not assume that second and later bytes + * of a frontend multibyte character couldn't look like ASCII characters. + * + * What about the encoding implications of the quote / excape chars? + * + * However, CR and NL characters that are inside a quoted field are + * not special, and are simply a part of the data value. The rough + * and ready rule we use to recognise this situation is to count + * the number of quote and escape characters on the line - if it's odd + * we are inside a quoted field (so these chars act like a toggle). + */ + for (;;) + { + c = CopyGetChar(); + if (c == EOF) + { + result = true; + break; + } + + /* not a complete parsing rule but good enough for here */ + if (c == quotec || c == escapec) + in_quote = ! in_quote; + + /* + * updating the line count for embedded CR and/or LF chars is + * necessarily a little fragile - this test is probably about + * the best we can do. + */ + if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) + copy_lineno++; + + if (!in_quote && c == '\r') + { + if (eol_type == EOL_NL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in CSV data"), + errhint("Use quoted CSV field to represent carriage return."))); + /* Check for \r\n on first line, _and_ handle \r\n. */ + if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) + { + int c2 = CopyPeekChar(); + + if (c2 == '\n') + { + CopyDonePeek(c2, true); /* eat newline */ + eol_type = EOL_CRNL; + } + else + { + /* found \r, but no \n */ + if (eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted carriage return found in CSV data"), + errhint("Use quoted CSV field to represent carriage return."))); + + /* + * if we got here, it is the first line and we didn't + * get \n, so put it back + */ + CopyDonePeek(c2, false); + eol_type = EOL_CR; + } + } + break; + } + if (!in_quote && c == '\n') + { + if (eol_type == EOL_CR || eol_type == EOL_CRNL) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("unquoted newline found in CSV data"), + errhint("Use quoted CSV field to represent newline."))); + eol_type = EOL_NL; + break; + } + + /* \ is only potentially magical at the start of a line */ + if (line_buf.len == 0 && c == '\\') + { + int c2 = CopyPeekChar(); + + if (c2 == EOF) + { + result = true; + + CopyDonePeek(c2, true); /* eat it - do we need to? */ + + break; + } + if (c2 == '.') + { + + CopyDonePeek(c2, true); /* so we can keep calling GetChar() */ + + if (eol_type == EOL_CRNL) + { + c = CopyGetChar(); + if (c == '\n') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); + if (c != '\r') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); + } + c = CopyGetChar(); + if (c != '\r' && c != '\n') + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker corrupt"))); + if ((eol_type == EOL_NL && c != '\n') || + (eol_type == EOL_CRNL && c != '\n') || + (eol_type == EOL_CR && c != '\r')) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("end-of-copy marker does not match previous newline style"))); + + /* + * In protocol version 3, we should ignore anything + * after \. up to the protocol end of copy data. (XXX + * maybe better not to treat \. as special?) + */ + if (copy_dest == COPY_NEW_FE) + { + while (c != EOF) + c = CopyGetChar(); + } + result = true; /* report EOF */ + break; + } + + CopyDonePeek(c2, false); /* not a dot, so put it back */ + + } + + appendStringInfoCharMacro(&line_buf, c); + + /* + * When client encoding != server, must be careful to read the + * extra bytes of a multibyte character exactly, since the encoding + * might not ensure they don't look like ASCII. When the encodings + * are the same, we need not do this, since no server encoding we + * use has ASCII-like following bytes. + */ + if (change_encoding) + { + s[0] = c; + mblen = pg_encoding_mblen(client_encoding, s); + for (j = 1; j < mblen; j++) + { + c = CopyGetChar(); + if (c == EOF) + { + result = true; + break; + } + appendStringInfoCharMacro(&line_buf, c); + } + if (result) + break; /* out of outer loop */ + } + } /* end of outer loop */ + + /* + * Done reading the line. Convert it to server encoding. + * + * Note: set line_buf_converted to true *before* attempting conversion; + * this prevents infinite recursion during error reporting should + * pg_client_to_server() issue an error, due to copy_in_error_callback + * again attempting the same conversion. We'll end up issuing the message + * without conversion, which is bad but better than nothing ... + */ + line_buf_converted = true; + + if (change_encoding) + { + cvt = (char *) pg_client_to_server((unsigned char *) line_buf.data, + line_buf.len); + if (cvt != line_buf.data) + { + /* transfer converted data back to line_buf */ + line_buf.len = 0; + line_buf.data[0] = '\0'; + appendBinaryStringInfo(&line_buf, cvt, strlen(cvt)); + } + } + + return result; + } + /*---------- * Read the value of a single attribute, performing de-escaping as needed. * *************** *** 2332,2366 **** for (;;) { - /* handle multiline quoted fields */ - if (in_quote && line_buf.cursor >= line_buf.len) - { - bool done; - - switch(eol_type) - { - case EOL_NL: - appendStringInfoString(&attribute_buf,"\n"); - break; - case EOL_CR: - appendStringInfoString(&attribute_buf,"\r"); - break; - case EOL_CRNL: - appendStringInfoString(&attribute_buf,"\r\n"); - break; - case EOL_UNKNOWN: - /* shouldn't happen - just keep going */ - break; - } - - copy_lineno++; - done = CopyReadLine(); - if (done && line_buf.len == 0) - break; - start_cursor = line_buf.cursor; - } - end_cursor = line_buf.cursor; if (line_buf.cursor >= line_buf.len) break; c = line_buf.data[line_buf.cursor++]; --- 2566,2573 ---- for (;;) { end_cursor = line_buf.cursor; + if (line_buf.cursor >= line_buf.len) break; c = line_buf.data[line_buf.cursor++];
pgsql-patches by date: