Re: CSV multiline final fix - Mailing list pgsql-patches
From | Andrew Dunstan |
---|---|
Subject | Re: CSV multiline final fix |
Date | |
Msg-id | 421ABB50.6090006@dunslane.net Whole thread Raw |
In response to | Re: CSV multiline final fix ("Andrew Dunstan" <andrew@dunslane.net>) |
Responses | Re: CSV multiline final fix |
List | pgsql-patches |
Andrew Dunstan wrote: >Bruce Momjian said: > > >>Shame we had to duplicate CopyReadLine() in a sense. >> >> >> >> > > >If you can find a clean way to merge them please do - I'll be very grateful. > My head started to spin when I tried. In general I dislike having more than >2 or 2 levels of logic in a given piece of code. > > > > Previous comment courtesy clumsy fingers and the Department of Redundancy Department (of course, I meant 2 or 3). Anyway, please review this patch for copy.c - it's possibly more to your taste. It's less redundant, but I'm not sure it's more clear. cheers andrew *** copy.c.orig Mon Feb 21 23:12:41 2005 --- copy.c Mon Feb 21 23:35:22 2005 *************** *** 98,104 **** static EolType eol_type; /* EOL type of input */ static int client_encoding; /* remote side's character encoding */ static int server_encoding; /* local encoding */ - static bool embedded_line_warning; /* these are just for error messages, see copy_in_error_callback */ static bool copy_binary; /* is it a binary copy? */ --- 98,103 ---- *************** *** 139,145 **** static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, char *delim, char *null_print, bool csv_mode, char *quote, char *escape, List *force_notnull_atts); ! static bool CopyReadLine(void); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); static char *CopyReadAttributeCSV(const char *delim, const char *null_print, --- 138,144 ---- static void CopyFrom(Relation rel, List *attnumlist, bool binary, bool oids, char *delim, char *null_print, bool csv_mode, char *quote, char *escape, List *force_notnull_atts); ! 
static bool CopyReadLine(char * quote, char * escape); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); static char *CopyReadAttributeCSV(const char *delim, const char *null_print, *************** *** 1191,1197 **** attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; attr_count = list_length(attnumlist); - embedded_line_warning = false; /* * Get info about the columns we need to process. --- 1190,1195 ---- *************** *** 1718,1724 **** ListCell *cur; /* Actually read the line into memory here */ ! done = CopyReadLine(); /* * EOF at start of line means we're done. If we see EOF after --- 1716,1723 ---- ListCell *cur; /* Actually read the line into memory here */ ! done = csv_mode ? ! CopyReadLine(quote, escape) : CopyReadLine(NULL, NULL); /* * EOF at start of line means we're done. If we see EOF after *************** *** 2006,2012 **** * by newline. */ static bool ! CopyReadLine(void) { bool result; bool change_encoding = (client_encoding != server_encoding); --- 2005,2011 ---- * by newline. */ static bool ! CopyReadLine(char * quote, char * escape) { bool result; bool change_encoding = (client_encoding != server_encoding); *************** *** 2015,2020 **** --- 2014,2032 ---- int j; unsigned char s[2]; char *cvt; + bool in_quote = false, last_was_esc = false, csv_mode = false; + char quotec = '\0', escapec = '\0'; + + if (quote) + { + csv_mode = true; + quotec = quote[0]; + escapec = escape[0]; + /* ignore special escape processing if it's the same as quotec */ + if (quotec == escapec) + escapec = '\0'; + } + s[1] = 0; *************** *** 2031,2041 **** /* * In this loop we only care for detecting newlines (\r and/or \n) and ! * the end-of-copy marker (\.). For backwards compatibility we allow * backslashes to escape newline characters. Backslashes other than * the end marker get put into the line_buf, since CopyReadAttribute ! * does its own escape processing. 
These four characters, and only ! * these four, are assumed the same in frontend and backend encodings. * We do not assume that second and later bytes of a frontend * multibyte character couldn't look like ASCII characters. */ --- 2043,2062 ---- /* * In this loop we only care for detecting newlines (\r and/or \n) and ! * the end-of-copy marker (\.). ! * ! * In Text mode, for backwards compatibility we allow * backslashes to escape newline characters. Backslashes other than * the end marker get put into the line_buf, since CopyReadAttribute ! * does its own escape processing. ! * ! * In CSV mode, CR and NL inside q quoted field are just part of the ! * data value and are put in line_buf. We keep just enough state ! * to know if we are currently in a quoted field or not. ! * ! * These four characters, and only these four, are assumed the same in ! * frontend and backend encodings. ! * * We do not assume that second and later bytes of a frontend * multibyte character couldn't look like ASCII characters. */ *************** *** 2047,2059 **** result = true; break; } ! if (c == '\r') { if (eol_type == EOL_NL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); /* Check for \r\n on first line, _and_ handle \r\n. */ if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) { --- 2068,2116 ---- result = true; break; } ! ! if (csv_mode) ! { ! /* ! * Dealing with quotes and escapes here is mildly tricky. If the ! * quote char is also the escape char, there's no problem - we ! * just use the char as a toggle. If they are different, we need ! * to ensure that we only take account of an escape inside a quoted ! * field and immediately preceding a quote char, and not the ! * second in a escape-escape sequence. ! */ ! ! if (in_quote && c == escapec) ! last_was_esc = ! last_was_esc; ! if (c == quotec && ! last_was_esc) ! in_quote = ! in_quote; ! if (c != escapec) ! 
last_was_esc = false; ! ! /* ! * updating the line count for embedded CR and/or LF chars is ! * necessarily a little fragile - this test is probably about ! * the best we can do. ! */ ! if (in_quote && c == (eol_type == EOL_CR ? '\r' : '\n')) ! copy_lineno++; ! } ! ! if (!in_quote && c == '\r') { if (eol_type == EOL_NL) ! { ! if (! csv_mode) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); ! else ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unquoted carriage return found in CSV data"), ! errhint("Use quoted CSV field to represent carriage return."))); ! } /* Check for \r\n on first line, _and_ handle \r\n. */ if (eol_type == EOL_UNKNOWN || eol_type == EOL_CRNL) { *************** *** 2068,2077 **** { /* found \r, but no \n */ if (eol_type == EOL_CRNL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); /* * if we got here, it is the first line and we didn't --- 2125,2143 ---- { /* found \r, but no \n */ if (eol_type == EOL_CRNL) ! { ! if (!csv_mode) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal carriage return found in data"), ! errhint("Use \"\\r\" to represent carriage return."))); ! else ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unquoted carriage return found in data"), ! errhint("Use quoted CSV field to represent carriage return."))); ! ! } /* * if we got here, it is the first line and we didn't *************** *** 2083,2108 **** } break; } ! if (c == '\n') { if (eol_type == EOL_CR || eol_type == EOL_CRNL) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal newline found in data"), ! errhint("Use \"\\n\" to represent newline."))); eol_type = EOL_NL; break; } ! if (c == '\\') { ! c = CopyGetChar(); ! 
if (c == EOF) { result = true; break; } ! if (c == '.') { if (eol_type == EOL_CRNL) { c = CopyGetChar(); --- 2149,2195 ---- } break; } ! if (!in_quote && c == '\n') { if (eol_type == EOL_CR || eol_type == EOL_CRNL) ! { ! if (!csv_mode) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("literal newline found in data"), ! errhint("Use \"\\n\" to represent newline."))); ! else ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unquoted newline found in data"), ! errhint("Use quoted CSV field to represent newline."))); ! ! } eol_type = EOL_NL; break; } ! ! if ((line_buf.len == 0 || !csv_mode) && c == '\\') { ! int c2; ! ! if (csv_mode) ! c2 = CopyPeekChar(); ! else ! c2 = c = CopyGetChar(); ! ! if (c2 == EOF) { result = true; + if (csv_mode) + CopyDonePeek(c2, true); break; } ! if (c2 == '.') { + if (csv_mode) + CopyDonePeek(c2, true); /* allow keep calling GetChar() */ + if (eol_type == EOL_CRNL) { c = CopyGetChar(); *************** *** 2140,2147 **** result = true; /* report EOF */ break; } ! /* not EOF mark, so emit \ and following char literally */ ! appendStringInfoCharMacro(&line_buf, '\\'); } appendStringInfoCharMacro(&line_buf, c); --- 2227,2238 ---- result = true; /* report EOF */ break; } ! ! if (csv_mode) ! CopyDonePeek(c2, false); /* not a dot, so put it back */ ! else ! /* not EOF mark, so emit \ and following char literally */ ! 
appendStringInfoCharMacro(&line_buf, '\\'); } appendStringInfoCharMacro(&line_buf, c); *************** *** 2369,2402 **** for (;;) { - /* handle multiline quoted fields */ - if (in_quote && line_buf.cursor >= line_buf.len) - { - bool done; - - switch (eol_type) - { - case EOL_NL: - appendStringInfoString(&attribute_buf, "\n"); - break; - case EOL_CR: - appendStringInfoString(&attribute_buf, "\r"); - break; - case EOL_CRNL: - appendStringInfoString(&attribute_buf, "\r\n"); - break; - case EOL_UNKNOWN: - /* shouldn't happen - just keep going */ - break; - } - - copy_lineno++; - done = CopyReadLine(); - if (done && line_buf.len == 0) - break; - start_cursor = line_buf.cursor; - } - end_cursor = line_buf.cursor; if (line_buf.cursor >= line_buf.len) break; --- 2460,2465 ---- *************** *** 2629,2653 **** !use_quote && (c = *test_string) != '\0'; test_string += mblen) { - /* - * We don't know here what the surrounding line end characters - * might be. It might not even be under postgres' control. So - * we simple warn on ANY embedded line ending character. - * - * This warning will disappear when we make line parsing field-aware, - * so that we can reliably read in embedded line ending characters - * regardless of the file's line-end context. - * - */ - - if (!embedded_line_warning && (c == '\n' || c == '\r') ) - { - embedded_line_warning = true; - elog(WARNING, - "CSV fields with embedded linefeed or carriage return " - "characters might not be able to be reimported"); - } - if (c == delimc || c == quotec || c == '\n' || c == '\r') use_quote = true; if (!same_encoding) --- 2692,2697 ----
pgsql-patches by date: