Re: COPY for CSV documentation - Mailing list pgsql-patches
From | Bruce Momjian |
---|---|
Subject | Re: COPY for CSV documentation |
Date | |
Msg-id | 200404121715.i3CHF7k10209@candle.pha.pa.us Whole thread Raw |
In response to | Re: COPY for CSV documentation (Andrew Dunstan <andrew@dunslane.net>) |
List | pgsql-patches |
FYI, this CVS is turning into quite a job, but doing it right takes this kind of effort. --------------------------------------------------------------------------- Andrew Dunstan wrote: > Bruce Momjian wrote: > > >Andrew Dunstan wrote: > > > > > >>In fact, in the patch I sent in, no quoted string is marked as null when > >>being read (so even if you use \N as the null marker, "\N" will be that > >>literal and not a null marker). And the null marker, whatever it is, > >>should be made quote safe by us throwing an error if it contains the > >>quote marker, just as we now make sure that the null marker is > >>delimiter-safe. > >> > >> > > > >What value does an int column get if the input file has ',,'. Don't > >tell me zero? :-) Error? > > > > > > If the null marker is not an empty string, it gets an error, of course - > if it is it gets a null: > > [andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table > foo (a int, b text, c text); copy foo from stdin delimiter ',\"' null > '\\\\N';" > ERROR: invalid input syntax for integer: "" > CONTEXT: COPY foo, line 1, column a: "" > [andrew@marmaduke pginst]$ echo ',,' | bin/psql -c "create temp table > foo (a int, b text, c text); copy foo from stdin delimiter ',\"' ;" > [andrew@marmaduke pginst]$ > > > I hope that is expected behaviour - it's what *I* expect, at least. > > > > > > > >>I will check on the write behaviour - it might need ammending too. > >> > >>I'll submit a revised patch based on the original syntax scheme, and > >>then you (Bruce) can make the syntax/psql changes that seem to be agreed > >>on now - is that ok? > >> > >> > > > >OK, go as far as you want and post it. I will turn around a new patch > >in a few hours after you post. > > > > > > > >>The default NULL value issue can be determined at the end of any > >>exhaustive debate we have - in the end it's a one line code change ;-) > >> > >> > > > >Agreed. > > > > > > > > Attached patch has these additions to previously posted patch: > . quote character may not appear in NULL marker > . any non-null value that matches the NULL marker is forced to be quoted > when written. > > > cheers > > andrew > > Index: src/backend/commands/copy.c > =================================================================== > RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v > retrieving revision 1.219 > diff -c -r1.219 copy.c > *** src/backend/commands/copy.c 6 Apr 2004 13:21:33 -0000 1.219 > --- src/backend/commands/copy.c 12 Apr 2004 16:21:33 -0000 > *************** > *** 70,76 **** > typedef enum CopyReadResult > { > NORMAL_ATTR, > ! END_OF_LINE > } CopyReadResult; > > /* > --- 70,77 ---- > typedef enum CopyReadResult > { > NORMAL_ATTR, > ! END_OF_LINE, > ! UNTERMINATED_FIELD > } CopyReadResult; > > /* > *************** > *** 136,144 **** > --- 137,148 ---- > static bool CopyReadLine(void); > static char *CopyReadAttribute(const char *delim, const char *null_print, > CopyReadResult *result, bool *isnull); > + static char *CopyReadAttributeCSV(const char *delim, const char *null_print, > + CopyReadResult *result, bool *isnull); > static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo, > Oid typelem, bool *isnull); > static void CopyAttributeOut(char *string, char *delim); > + static void CopyAttributeOutCSV(char *string, char *delim, bool force_quote); > static List *CopyGetAttnums(Relation rel, List *attnamelist); > static void limit_printout_length(StringInfo buf); > > *************** > *** 682,687 **** > --- 686,692 ---- > List *attnumlist; > bool binary = false; > bool oids = false; > + bool csv_mode = false; > char *delim = NULL; > char *null_print = NULL; > Relation rel; > *************** > *** 744,751 **** > if (!delim) > delim = "\t"; > > if (!null_print) > ! null_print = "\\N"; > > /* > * Open and lock the relation, using the appropriate lock type. > --- 749,759 ---- > if (!delim) > delim = "\t"; > > + if (strlen(delim) > 1) > + csv_mode = true; > + > if (!null_print) > ! null_print = csv_mode ? "" : "\\N"; > > /* > * Open and lock the relation, using the appropriate lock type. > *************** > *** 772,783 **** > "psql's \\copy command also works for anyone."))); > > /* > ! * Presently, only single-character delimiter strings are supported. > */ > ! if (strlen(delim) != 1) > ereport(ERROR, > (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > errmsg("COPY delimiter must be a single character"))); > > /* > * Don't allow the delimiter to appear in the null string. > --- 780,806 ---- > "psql's \\copy command also works for anyone."))); > > /* > ! * Only single-character delimiter strings are supported, > ! * except in CSV mode, where the string must be > ! * delimiter-char quote-char [escape-char] > */ > ! if (!csv_mode && strlen(delim) != 1) > ereport(ERROR, > (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > errmsg("COPY delimiter must be a single character"))); > + else if (csv_mode) > + { > + if(strlen(delim) > 3) > + ereport(ERROR, > + (errcode(ERRCODE_SYNTAX_ERROR), > + errmsg("COPY delimiters for CSV must be a 2 or 3 characters"))); > + if (delim[0] == delim[1] || > + (strlen(delim) == 3 && delim[0] == delim[2])) > + ereport(ERROR, > + (errcode(ERRCODE_SYNTAX_ERROR), > + errmsg("CSV delimiter character must not be same as quote character or escape character"))); > + > + } > > /* > * Don't allow the delimiter to appear in the null string. > *************** > *** 788,793 **** > --- 811,833 ---- > errmsg("COPY delimiter must not appear in the NULL specification"))); > > /* > + * Don't allow the csv quote char to appear in the null string. > + */ > + if (csv_mode && strchr(null_print, delim[1]) != NULL) > + ereport(ERROR, > + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), > + errmsg("CSV quote character must not appear in the NULL specification"))); > + > + /* > + * Don't allow OIDs in CSV mode > + */ > + > + if (csv_mode && oids) > + ereport(ERROR, > + (errcode(ERRCODE_SYNTAX_ERROR), > + errmsg("Cannot specify OIDS in CSV mode "))); > + > + /* > * Don't allow COPY w/ OIDs to or from a table without them > */ > if (oids && !rel->rd_rel->relhasoids) > *************** > *** 969,974 **** > --- 1009,1015 ---- > FmgrInfo *out_functions; > Oid *elements; > bool *isvarlena; > + bool csv_mode; > char *string; > Snapshot mySnapshot; > List *cur; > *************** > *** 979,984 **** > --- 1020,1026 ---- > attr = tupDesc->attrs; > num_phys_attrs = tupDesc->natts; > attr_count = length(attnumlist); > + csv_mode = (strlen(delim) > 1); > > /* > * Get info about the columns we need to process. > *************** > *** 1051,1057 **** > while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL) > { > bool need_delim = false; > - > CHECK_FOR_INTERRUPTS(); > > MemoryContextReset(mycontext); > --- 1093,1098 ---- > *************** > *** 1113,1119 **** > value, > ObjectIdGetDatum(elements[attnum - 1]), > Int32GetDatum(attr[attnum - 1]->atttypmod))); > ! CopyAttributeOut(string, delim); > } > else > { > --- 1154,1167 ---- > value, > ObjectIdGetDatum(elements[attnum - 1]), > Int32GetDatum(attr[attnum - 1]->atttypmod))); > ! if (csv_mode) > ! { > ! bool force_quote = (strcmp(string,null_print) == 0); > ! CopyAttributeOutCSV(string, delim, force_quote); > ! } > ! else > ! CopyAttributeOut(string, delim); > ! > } > else > { > *************** > *** 1263,1268 **** > --- 1311,1317 ---- > Datum *values; > char *nulls; > bool done = false; > + bool csv_mode; > bool isnull; > ResultRelInfo *resultRelInfo; > EState *estate = CreateExecutorState(); /* for ExecConstraints() */ > *************** > *** 1280,1285 **** > --- 1329,1335 ---- > num_phys_attrs = tupDesc->natts; > attr_count = length(attnumlist); > num_defaults = 0; > + csv_mode = (strlen(delim) > 1); > > /* > * We need a ResultRelInfo so we can use the regular executor's > *************** > *** 1499,1504 **** > --- 1549,1555 ---- > > if (file_has_oids) > { > + /* can't be in CSV mode here */ > string = CopyReadAttribute(delim, null_print, > &result, &isnull); > > *************** > *** 1537,1544 **** > errmsg("missing data for column \"%s\"", > NameStr(attr[m]->attname)))); > > ! string = CopyReadAttribute(delim, null_print, > ! &result, &isnull); > > if (isnull) > { > --- 1588,1608 ---- > errmsg("missing data for column \"%s\"", > NameStr(attr[m]->attname)))); > > ! if (csv_mode) > ! { > ! string = CopyReadAttributeCSV(delim, null_print, > ! &result, &isnull); > ! if (result == UNTERMINATED_FIELD) > ! ereport(ERROR, > ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), > ! errmsg("unterminated CSV quoted field"))); > ! } > ! else > ! { > ! string = CopyReadAttribute(delim, null_print, > ! &result, &isnull); > ! } > ! > > if (isnull) > { > *************** > *** 2069,2074 **** > --- 2133,2288 ---- > return attribute_buf.data; > } > > + > + /* > + * Read the value of a single attribute in CSV mode, > + * performing de-escaping as needed. Escaping does not follow the normal > + * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage. > + * > + * Quoted fields can span lines, in which case the line end is embedded > + * in the returned string. > + * > + * delim is a 2- or 3-character string. The first character is the > + * field delimiter, the second the quote character, the third is the > + * escape character indise quotes, and defaults to the quote character. > + * > + * null_print is the null marker string. Note that this is compared to > + * the pre-de-escaped input string (thus if it is quoted it is not a NULL). > + * > + * *result is set to indicate what terminated the read: > + * NORMAL_ATTR: column delimiter > + * END_OF_LINE: end of line > + * UNTERMINATED_FIELD no quote detected at end of a quoted field > + * > + * In any case, the string read up to the terminator (or end of file) > + * is returned. > + * > + * *isnull is set true or false depending on whether the input matched > + * the null marker. Note that the caller cannot check this since the > + * returned string will be the post-de-escaping equivalent, which may > + * look the same as some valid data string. > + *---------- > + */ > + > + static char * > + CopyReadAttributeCSV(const char *delim, const char *null_print, > + CopyReadResult *result, bool *isnull) > + { > + char delimc = delim[0]; > + char quotec = delim[1]; > + char escapec = delim[2] ? delim[2] : delim[1]; > + char c; > + int start_cursor = line_buf.cursor; > + int end_cursor = start_cursor;; > + int input_len; > + bool in_quote = false; > + bool saw_quote = false; > + > + /* reset attribute_buf to empty */ > + attribute_buf.len = 0; > + attribute_buf.data[0] = '\0'; > + > + /* set default status */ > + *result = END_OF_LINE; > + > + for (;;) > + { > + /* handle multiline quoted fields */ > + if (in_quote && line_buf.cursor >= line_buf.len) > + { > + bool done; > + > + switch(eol_type) > + { > + case EOL_NL: > + appendStringInfoString(&attribute_buf,"\n"); > + break; > + case EOL_CR: > + appendStringInfoString(&attribute_buf,"\r"); > + break; > + case EOL_CRNL: > + appendStringInfoString(&attribute_buf,"\r\n"); > + break; > + case EOL_UNKNOWN: > + /* shouldn't happen - just keep going */ > + break; > + } > + > + copy_lineno++; > + done = CopyReadLine(); > + if (done && line_buf.len == 0) > + break; > + start_cursor = line_buf.cursor; > + } > + > + end_cursor = line_buf.cursor; > + if (line_buf.cursor >= line_buf.len) > + break; > + c = line_buf.data[line_buf.cursor++]; > + /* > + * unquoted field delimiter > + */ > + if (!in_quote && c == delimc) > + { > + *result = NORMAL_ATTR; > + break; > + } > + /* > + * start of quoted field (or part of field) > + */ > + if (!in_quote && c == quotec) > + { > + saw_quote = true; > + in_quote = true; > + continue; > + } > + /* > + * escape within a quoted field > + */ > + if (in_quote && c == escapec) > + { > + /* > + * peek at the next char if available, and escape it if it > + * is an escape char or a quote char > + */ > + if (line_buf.cursor <= line_buf.len) > + { > + char nextc = line_buf.data[line_buf.cursor]; > + if (nextc == escapec || nextc == quotec) > + { > + appendStringInfoCharMacro(&attribute_buf, nextc); > + line_buf.cursor++; > + continue; > + } > + } > + } > + /* > + * end of quoted field. > + * Must do this test after testing for escape in case quote char > + * and escape char are the same (which is the common case). > + */ > + if(in_quote && c == quotec) > + { > + in_quote = false; > + continue; > + } > + appendStringInfoCharMacro(&attribute_buf, c); > + } > + > + if (in_quote) > + *result = UNTERMINATED_FIELD; > + > + /* check whether raw input matched null marker */ > + input_len = end_cursor - start_cursor; > + if (!saw_quote && input_len == strlen(null_print) && > + strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0) > + *isnull = true; > + else > + *isnull = false; > + > + return attribute_buf.data; > + } > + > /* > * Read a binary attribute > */ > *************** > *** 2192,2197 **** > --- 2406,2479 ---- > break; > } > } > + } > + > + /* > + * Send CSV representation of one attribute, with conversion and > + * CSV type escaping > + */ > + static void > + CopyAttributeOutCSV(char *server_string, char *delim, bool force_quote) > + { > + char *string; > + char c; > + char delimc = delim[0]; > + char quotec = delim[1]; > + char escapec = delim[2] ? delim[2] : delim[1]; > + bool need_quote = force_quote; > + char *test_string; > + bool same_encoding; > + int mblen; > + int i; > + > + same_encoding = (server_encoding == client_encoding); > + if (!same_encoding) > + string = (char *) pg_server_to_client((unsigned char *) server_string, > + strlen(server_string)); > + else > + string = server_string; > + > + /* have to run through the string twice, > + * first time to see if it needs quoting, second to actually send it > + */ > + > + for(test_string = string; > + !need_quote && (c = *test_string) != '\0'; > + test_string += mblen) > + { > + if (c == delimc || c == quotec || c == '\n' || c == '\r') > + { > + need_quote = true; > + } > + if (!same_encoding) > + mblen = pg_encoding_mblen(client_encoding, test_string); > + else > + mblen = 1; > + } > + > + if (need_quote) > + CopySendChar(quotec); > + > + for (; (c = *string) != '\0'; string += mblen) > + { > + if (c == quotec || c == escapec) > + CopySendChar(escapec); > + > + CopySendChar(c); > + > + if (!same_encoding) > + { > + /* send additional bytes of the char, if any */ > + mblen = pg_encoding_mblen(client_encoding, string); > + for (i = 1; i < mblen; i++) > + CopySendChar(string[i]); > + } > + else > + mblen = 1; > + } > + > + if (need_quote) > + CopySendChar(quotec); > } > > /* > > ---------------------------(end of broadcast)--------------------------- > TIP 4: Don't 'kill -9' the postmaster -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073
pgsql-patches by date: