Re: COPY for CSV documentation - Mailing list pgsql-patches
From | Andrew Dunstan |
---|---|
Subject | Re: COPY for CSV documentation |
Date | |
Msg-id | 407AC373.2010106@dunslane.net Whole thread Raw |
In response to | Re: COPY for CSV documentation (Bruce Momjian <pgman@candle.pha.pa.us>) |
Responses |
Re: COPY for CSV documentation
Re: COPY for CSV documentation Updated COPY CSV patch |
List | pgsql-patches |
Bruce Momjian wrote: >Andrew Dunstan wrote: > > >>In fact, in the patch I sent in, no quoted string is marked as null when >>being read (so even if you use \N as the null marker, "\N" will be that >>literal and not a null marker). And the null marker, whatever it is, >>should be made quote safe by us throwing an error if it contains the >>quote marker, just as we now make sure that the null marker is >>delimiter-safe. >> >> > >What value does an int column get if the input file has ',,'. Don't >tell me zero? :-) Error? > > If the null marker is not an empty string, it gets an error, of course - if it is it gets a null: [andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table foo (a int, b text, c text); copy foo from stdin delimiter ',\"' null '\\\\N';" ERROR: invalid input syntax for integer: "" CONTEXT: COPY foo, line 1, column a: "" [andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table foo (a int, b text, c text); copy foo from stdin delimiter ',\"' ;" [andrew@marmaduke pginst]$ I hope that is expected behaviour - it's what *I* expect, at least. > > >>I will check on the write behaviour - it might need ammending too. >> >>I'll submit a revised patch based on the original syntax scheme, and >>then you (Bruce) can make the syntax/psql changes that seem to be agreed >>on now - is that ok? >> >> > >OK, go as far as you want and post it. I will turn around a new patch >in a few hours after you post. > > > >>The default NULL value issue can be determined at the end of any >>exhaustive debate we have - in the end it's a one line code change ;-) >> >> > >Agreed. > > > Attached patch has these additions to previously posted patch: . quote character may not appear in NULL marker . any non-null value that matches the NULL marker is forced to be quoted when written. cheers andrew Index: src/backend/commands/copy.c =================================================================== RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v retrieving revision 1.219 diff -c -r1.219 copy.c *** src/backend/commands/copy.c 6 Apr 2004 13:21:33 -0000 1.219 --- src/backend/commands/copy.c 12 Apr 2004 16:21:33 -0000 *************** *** 70,76 **** typedef enum CopyReadResult { NORMAL_ATTR, ! END_OF_LINE } CopyReadResult; /* --- 70,77 ---- typedef enum CopyReadResult { NORMAL_ATTR, ! END_OF_LINE, ! UNTERMINATED_FIELD } CopyReadResult; /* *************** *** 136,144 **** --- 137,148 ---- static bool CopyReadLine(void); static char *CopyReadAttribute(const char *delim, const char *null_print, CopyReadResult *result, bool *isnull); + static char *CopyReadAttributeCSV(const char *delim, const char *null_print, + CopyReadResult *result, bool *isnull); static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo, Oid typelem, bool *isnull); static void CopyAttributeOut(char *string, char *delim); + static void CopyAttributeOutCSV(char *string, char *delim, bool force_quote); static List *CopyGetAttnums(Relation rel, List *attnamelist); static void limit_printout_length(StringInfo buf); *************** *** 682,687 **** --- 686,692 ---- List *attnumlist; bool binary = false; bool oids = false; + bool csv_mode = false; char *delim = NULL; char *null_print = NULL; Relation rel; *************** *** 744,751 **** if (!delim) delim = "\t"; if (!null_print) ! null_print = "\\N"; /* * Open and lock the relation, using the appropriate lock type. --- 749,759 ---- if (!delim) delim = "\t"; + if (strlen(delim) > 1) + csv_mode = true; + if (!null_print) ! null_print = csv_mode ? "" : "\\N"; /* * Open and lock the relation, using the appropriate lock type. *************** *** 772,783 **** "psql's \\copy command also works for anyone."))); /* ! * Presently, only single-character delimiter strings are supported. */ ! if (strlen(delim) != 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY delimiter must be a single character"))); /* * Don't allow the delimiter to appear in the null string. --- 780,806 ---- "psql's \\copy command also works for anyone."))); /* ! * Only single-character delimiter strings are supported, ! * except in CSV mode, where the string must be ! * delimiter-char quote-char [escape-char] */ ! if (!csv_mode && strlen(delim) != 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("COPY delimiter must be a single character"))); + else if (csv_mode) + { + if(strlen(delim) > 3) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("COPY delimiters for CSV must be a 2 or 3 characters"))); + if (delim[0] == delim[1] || + (strlen(delim) == 3 && delim[0] == delim[2])) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("CSV delimiter character must not be same as quote character or escape character"))); + + } /* * Don't allow the delimiter to appear in the null string. *************** *** 788,793 **** --- 811,833 ---- errmsg("COPY delimiter must not appear in the NULL specification"))); /* + * Don't allow the csv quote char to appear in the null string. + */ + if (csv_mode && strchr(null_print, delim[1]) != NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("CSV quote character must not appear in the NULL specification"))); + + /* + * Don't allow OIDs in CSV mode + */ + + if (csv_mode && oids) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Cannot specify OIDS in CSV mode "))); + + /* * Don't allow COPY w/ OIDs to or from a table without them */ if (oids && !rel->rd_rel->relhasoids) *************** *** 969,974 **** --- 1009,1015 ---- FmgrInfo *out_functions; Oid *elements; bool *isvarlena; + bool csv_mode; char *string; Snapshot mySnapshot; List *cur; *************** *** 979,984 **** --- 1020,1026 ---- attr = tupDesc->attrs; num_phys_attrs = tupDesc->natts; attr_count = length(attnumlist); + csv_mode = (strlen(delim) > 1); /* * Get info about the columns we need to process. *************** *** 1051,1057 **** while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL) { bool need_delim = false; - CHECK_FOR_INTERRUPTS(); MemoryContextReset(mycontext); --- 1093,1098 ---- *************** *** 1113,1119 **** value, ObjectIdGetDatum(elements[attnum - 1]), Int32GetDatum(attr[attnum - 1]->atttypmod))); ! CopyAttributeOut(string, delim); } else { --- 1154,1167 ---- value, ObjectIdGetDatum(elements[attnum - 1]), Int32GetDatum(attr[attnum - 1]->atttypmod))); ! if (csv_mode) ! { ! bool force_quote = (strcmp(string,null_print) == 0); ! CopyAttributeOutCSV(string, delim, force_quote); ! } ! else ! CopyAttributeOut(string, delim); ! } else { *************** *** 1263,1268 **** --- 1311,1317 ---- Datum *values; char *nulls; bool done = false; + bool csv_mode; bool isnull; ResultRelInfo *resultRelInfo; EState *estate = CreateExecutorState(); /* for ExecConstraints() */ *************** *** 1280,1285 **** --- 1329,1335 ---- num_phys_attrs = tupDesc->natts; attr_count = length(attnumlist); num_defaults = 0; + csv_mode = (strlen(delim) > 1); /* * We need a ResultRelInfo so we can use the regular executor's *************** *** 1499,1504 **** --- 1549,1555 ---- if (file_has_oids) { + /* can't be in CSV mode here */ string = CopyReadAttribute(delim, null_print, &result, &isnull); *************** *** 1537,1544 **** errmsg("missing data for column \"%s\"", NameStr(attr[m]->attname)))); ! string = CopyReadAttribute(delim, null_print, ! &result, &isnull); if (isnull) { --- 1588,1608 ---- errmsg("missing data for column \"%s\"", NameStr(attr[m]->attname)))); ! if (csv_mode) ! { ! string = CopyReadAttributeCSV(delim, null_print, ! &result, &isnull); ! if (result == UNTERMINATED_FIELD) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unterminated CSV quoted field"))); ! } ! else ! { ! string = CopyReadAttribute(delim, null_print, ! &result, &isnull); ! } ! if (isnull) { *************** *** 2069,2074 **** --- 2133,2288 ---- return attribute_buf.data; } + + /* + * Read the value of a single attribute in CSV mode, + * performing de-escaping as needed. Escaping does not follow the normal + * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage. + * + * Quoted fields can span lines, in which case the line end is embedded + * in the returned string. + * + * delim is a 2- or 3-character string. The first character is the + * field delimiter, the second the quote character, the third is the + * escape character indise quotes, and defaults to the quote character. + * + * null_print is the null marker string. Note that this is compared to + * the pre-de-escaped input string (thus if it is quoted it is not a NULL). + * + * *result is set to indicate what terminated the read: + * NORMAL_ATTR: column delimiter + * END_OF_LINE: end of line + * UNTERMINATED_FIELD no quote detected at end of a quoted field + * + * In any case, the string read up to the terminator (or end of file) + * is returned. + * + * *isnull is set true or false depending on whether the input matched + * the null marker. Note that the caller cannot check this since the + * returned string will be the post-de-escaping equivalent, which may + * look the same as some valid data string. + *---------- + */ + + static char * + CopyReadAttributeCSV(const char *delim, const char *null_print, + CopyReadResult *result, bool *isnull) + { + char delimc = delim[0]; + char quotec = delim[1]; + char escapec = delim[2] ? delim[2] : delim[1]; + char c; + int start_cursor = line_buf.cursor; + int end_cursor = start_cursor;; + int input_len; + bool in_quote = false; + bool saw_quote = false; + + /* reset attribute_buf to empty */ + attribute_buf.len = 0; + attribute_buf.data[0] = '\0'; + + /* set default status */ + *result = END_OF_LINE; + + for (;;) + { + /* handle multiline quoted fields */ + if (in_quote && line_buf.cursor >= line_buf.len) + { + bool done; + + switch(eol_type) + { + case EOL_NL: + appendStringInfoString(&attribute_buf,"\n"); + break; + case EOL_CR: + appendStringInfoString(&attribute_buf,"\r"); + break; + case EOL_CRNL: + appendStringInfoString(&attribute_buf,"\r\n"); + break; + case EOL_UNKNOWN: + /* shouldn't happen - just keep going */ + break; + } + + copy_lineno++; + done = CopyReadLine(); + if (done && line_buf.len == 0) + break; + start_cursor = line_buf.cursor; + } + + end_cursor = line_buf.cursor; + if (line_buf.cursor >= line_buf.len) + break; + c = line_buf.data[line_buf.cursor++]; + /* + * unquoted field delimiter + */ + if (!in_quote && c == delimc) + { + *result = NORMAL_ATTR; + break; + } + /* + * start of quoted field (or part of field) + */ + if (!in_quote && c == quotec) + { + saw_quote = true; + in_quote = true; + continue; + } + /* + * escape within a quoted field + */ + if (in_quote && c == escapec) + { + /* + * peek at the next char if available, and escape it if it + * is an escape char or a quote char + */ + if (line_buf.cursor <= line_buf.len) + { + char nextc = line_buf.data[line_buf.cursor]; + if (nextc == escapec || nextc == quotec) + { + appendStringInfoCharMacro(&attribute_buf, nextc); + line_buf.cursor++; + continue; + } + } + } + /* + * end of quoted field. + * Must do this test after testing for escape in case quote char + * and escape char are the same (which is the common case). + */ + if(in_quote && c == quotec) + { + in_quote = false; + continue; + } + appendStringInfoCharMacro(&attribute_buf, c); + } + + if (in_quote) + *result = UNTERMINATED_FIELD; + + /* check whether raw input matched null marker */ + input_len = end_cursor - start_cursor; + if (!saw_quote && input_len == strlen(null_print) && + strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0) + *isnull = true; + else + *isnull = false; + + return attribute_buf.data; + } + /* * Read a binary attribute */ *************** *** 2192,2197 **** --- 2406,2479 ---- break; } } + } + + /* + * Send CSV representation of one attribute, with conversion and + * CSV type escaping + */ + static void + CopyAttributeOutCSV(char *server_string, char *delim, bool force_quote) + { + char *string; + char c; + char delimc = delim[0]; + char quotec = delim[1]; + char escapec = delim[2] ? delim[2] : delim[1]; + bool need_quote = force_quote; + char *test_string; + bool same_encoding; + int mblen; + int i; + + same_encoding = (server_encoding == client_encoding); + if (!same_encoding) + string = (char *) pg_server_to_client((unsigned char *) server_string, + strlen(server_string)); + else + string = server_string; + + /* have to run through the string twice, + * first time to see if it needs quoting, second to actually send it + */ + + for(test_string = string; + !need_quote && (c = *test_string) != '\0'; + test_string += mblen) + { + if (c == delimc || c == quotec || c == '\n' || c == '\r') + { + need_quote = true; + } + if (!same_encoding) + mblen = pg_encoding_mblen(client_encoding, test_string); + else + mblen = 1; + } + + if (need_quote) + CopySendChar(quotec); + + for (; (c = *string) != '\0'; string += mblen) + { + if (c == quotec || c == escapec) + CopySendChar(escapec); + + CopySendChar(c); + + if (!same_encoding) + { + /* send additional bytes of the char, if any */ + mblen = pg_encoding_mblen(client_encoding, string); + for (i = 1; i < mblen; i++) + CopySendChar(string[i]); + } + else + mblen = 1; + } + + if (need_quote) + CopySendChar(quotec); } /*
pgsql-patches by date: