Re: COPY for CSVs - Mailing list pgsql-patches

From Andrew Dunstan
Subject Re: COPY for CSVs
Date
Msg-id 407736FE.8080305@dunslane.net
Whole thread Raw
In response to Re: COPY for CSVs  (Andrew Dunstan <andrew@dunslane.net>)
List pgsql-patches

Andrew Dunstan wrote:

> Andrew Dunstan wrote:
>
>>
>> Attached is an undocumented patch that implements COPY for CSVs
>> according to the scheme recently discussed. I believe it handles all
>> the straightforward and perverse cases that were discussed (including
>> multiline fields and partially quoted fields).
>>
>> I have done some light testing on it, but would appreciate some
>> assistance with testing, as well as any useful review comments.
>>
>
> Looking back over the discussion, I see that there was demand for the
> ability to export as well as import CSVs.
>
> That will actually be rather easier to do, so I will revise the patch
> accordingly.
>

version attached that does copy out as well as copy in. I am not sure
about the multibyte stuff, so that needs careful review.

cheers

andrew
Index: src/backend/commands/copy.c
===================================================================
RCS file: /projects/cvsroot/pgsql-server/src/backend/commands/copy.c,v
retrieving revision 1.219
diff -c -r1.219 copy.c
*** src/backend/commands/copy.c    6 Apr 2004 13:21:33 -0000    1.219
--- src/backend/commands/copy.c    9 Apr 2004 23:47:53 -0000
***************
*** 70,76 ****
  typedef enum CopyReadResult
  {
      NORMAL_ATTR,
!     END_OF_LINE
  } CopyReadResult;

  /*
--- 70,77 ----
  typedef enum CopyReadResult
  {
      NORMAL_ATTR,
!     END_OF_LINE,
!     UNTERMINATED_FIELD
  } CopyReadResult;

  /*
***************
*** 136,144 ****
--- 137,148 ----
  static bool CopyReadLine(void);
  static char *CopyReadAttribute(const char *delim, const char *null_print,
                                 CopyReadResult *result, bool *isnull);
+ static char *CopyReadAttributeCSV(const char *delim, const char *null_print,
+                                CopyReadResult *result, bool *isnull);
  static Datum CopyReadBinaryAttribute(int column_no, FmgrInfo *flinfo,
                          Oid typelem, bool *isnull);
  static void CopyAttributeOut(char *string, char *delim);
+ static void CopyAttributeOutCSV(char *string, char *delim);
  static List *CopyGetAttnums(Relation rel, List *attnamelist);
  static void limit_printout_length(StringInfo buf);

***************
*** 682,687 ****
--- 686,692 ----
      List       *attnumlist;
      bool        binary = false;
      bool        oids = false;
+     bool        csv_mode = false;
      char       *delim = NULL;
      char       *null_print = NULL;
      Relation    rel;
***************
*** 744,751 ****
      if (!delim)
          delim = "\t";

      if (!null_print)
!         null_print = "\\N";

      /*
       * Open and lock the relation, using the appropriate lock type.
--- 749,759 ----
      if (!delim)
          delim = "\t";

+     if (strlen(delim) > 1)
+         csv_mode = true;
+
      if (!null_print)
!         null_print = csv_mode ? "" : "\\N";

      /*
       * Open and lock the relation, using the appropriate lock type.
***************
*** 772,783 ****
                         "psql's \\copy command also works for anyone.")));

      /*
!      * Presently, only single-character delimiter strings are supported.
       */
!     if (strlen(delim) != 1)
          ereport(ERROR,
                  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                   errmsg("COPY delimiter must be a single character")));

      /*
       * Don't allow the delimiter to appear in the null string.
--- 780,806 ----
                         "psql's \\copy command also works for anyone.")));

      /*
!      * Only single-character delimiter strings are supported,
!      * except in CSV mode, where the string must be
!      * delimiter-char quote-char [escape-char]
       */
!     if (!csv_mode && strlen(delim) != 1)
          ereport(ERROR,
                  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                   errmsg("COPY delimiter must be a single character")));
+     else if (csv_mode)
+     {
+         if(strlen(delim) > 3)
+             ereport(ERROR,
+                 (errcode(ERRCODE_SYNTAX_ERROR),
+                  errmsg("COPY delimiters for CSV must be a 2 or 3 characters")));
+         if (delim[0] == delim[1] ||
+             (strlen(delim) == 3 && delim[0] == delim[2]))
+             ereport(ERROR,
+                 (errcode(ERRCODE_SYNTAX_ERROR),
+                  errmsg("CSV delimiter character must not be same as quote character or escape character")));
+
+     }

      /*
       * Don't allow the delimiter to appear in the null string.
***************
*** 788,793 ****
--- 811,825 ----
                   errmsg("COPY delimiter must not appear in the NULL specification")));

      /*
+      * Don't allow OIDs in CSV mode
+      */
+
+     if (csv_mode && oids)
+         ereport(ERROR,
+                 (errcode(ERRCODE_SYNTAX_ERROR),
+                  errmsg("Cannot specify OIDS in CSV mode ")));
+
+     /*
       * Don't allow COPY w/ OIDs to or from a table without them
       */
      if (oids && !rel->rd_rel->relhasoids)
***************
*** 969,974 ****
--- 1001,1007 ----
      FmgrInfo   *out_functions;
      Oid           *elements;
      bool       *isvarlena;
+     bool       csv_mode;
      char       *string;
      Snapshot    mySnapshot;
      List       *cur;
***************
*** 979,984 ****
--- 1012,1018 ----
      attr = tupDesc->attrs;
      num_phys_attrs = tupDesc->natts;
      attr_count = length(attnumlist);
+     csv_mode = (strlen(delim) > 1);

      /*
       * Get info about the columns we need to process.
***************
*** 1051,1057 ****
      while ((tuple = heap_getnext(scandesc, ForwardScanDirection)) != NULL)
      {
          bool        need_delim = false;
-
          CHECK_FOR_INTERRUPTS();

          MemoryContextReset(mycontext);
--- 1085,1090 ----
***************
*** 1113,1119 ****
                                                             value,
                                    ObjectIdGetDatum(elements[attnum - 1]),
                              Int32GetDatum(attr[attnum - 1]->atttypmod)));
!                     CopyAttributeOut(string, delim);
                  }
                  else
                  {
--- 1146,1156 ----
                                                             value,
                                    ObjectIdGetDatum(elements[attnum - 1]),
                              Int32GetDatum(attr[attnum - 1]->atttypmod)));
!                     if (csv_mode)
!                         CopyAttributeOutCSV(string, delim);
!                     else
!                         CopyAttributeOut(string, delim);
!
                  }
                  else
                  {
***************
*** 1263,1268 ****
--- 1300,1306 ----
      Datum       *values;
      char       *nulls;
      bool        done = false;
+     bool        csv_mode;
      bool        isnull;
      ResultRelInfo *resultRelInfo;
      EState       *estate = CreateExecutorState(); /* for ExecConstraints() */
***************
*** 1280,1285 ****
--- 1318,1324 ----
      num_phys_attrs = tupDesc->natts;
      attr_count = length(attnumlist);
      num_defaults = 0;
+     csv_mode = (strlen(delim) > 1);

      /*
       * We need a ResultRelInfo so we can use the regular executor's
***************
*** 1499,1504 ****
--- 1538,1544 ----

              if (file_has_oids)
              {
+                 /* can't be in CSV mode here */
                  string = CopyReadAttribute(delim, null_print,
                                             &result, &isnull);

***************
*** 1537,1544 ****
                               errmsg("missing data for column \"%s\"",
                                      NameStr(attr[m]->attname))));

!                 string = CopyReadAttribute(delim, null_print,
!                                            &result, &isnull);

                  if (isnull)
                  {
--- 1577,1597 ----
                               errmsg("missing data for column \"%s\"",
                                      NameStr(attr[m]->attname))));

!                 if (csv_mode)
!                 {
!                     string = CopyReadAttributeCSV(delim, null_print,
!                                                   &result, &isnull);
!                     if (result == UNTERMINATED_FIELD)
!                         ereport(ERROR,
!                                 (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                                  errmsg("unterminated CSV quoted field")));
!                 }
!                 else
!                 {
!                     string = CopyReadAttribute(delim, null_print,
!                                                &result, &isnull);
!                 }
!

                  if (isnull)
                  {
***************
*** 2069,2074 ****
--- 2122,2277 ----
      return attribute_buf.data;
  }

+
+ /*
+  * Read the value of a single attribute in CSV mode,
+  * performing de-escaping as needed. Escaping does not follow the normal
+  * PostgreSQL text mode, but instead "standard" (i.e. common) CSV usage.
+  *
+  * Quoted fields can span lines, in which case the line end is embedded
+  * in the returned string.
+  *
+  * delim is a 2- or 3-character string. The first character is the
+  * field delimiter, the second the quote character, the third is the
+  * escape character indise quotes, and defaults to the quote character.
+  *
+  * null_print is the null marker string.  Note that this is compared to
+  * the pre-de-escaped input string (thus if it is quoted it is not a NULL).
+  *
+  * *result is set to indicate what terminated the read:
+  *        NORMAL_ATTR:    column delimiter
+  *        END_OF_LINE:    end of line
+  *      UNTERMINATED_FIELD no quote detected at end of a quoted field
+  *
+  * In any case, the string read up to the terminator (or end of file)
+  * is returned.
+  *
+  * *isnull is set true or false depending on whether the input matched
+  * the null marker.  Note that the caller cannot check this since the
+  * returned string will be the post-de-escaping equivalent, which may
+  * look the same as some valid data string.
+  *----------
+  */
+
+ static char *
+ CopyReadAttributeCSV(const char *delim, const char *null_print,
+                   CopyReadResult *result, bool *isnull)
+ {
+     char        delimc = delim[0];
+     char        quotec = delim[1];
+     char        escapec = delim[2] ? delim[2] : delim[1];
+     char        c;
+     int            start_cursor = line_buf.cursor;
+     int            end_cursor = start_cursor;;
+     int            input_len;
+     bool        in_quote = false;
+     bool        saw_quote = false;
+
+     /* reset attribute_buf to empty */
+     attribute_buf.len = 0;
+     attribute_buf.data[0] = '\0';
+
+     /* set default status */
+     *result = END_OF_LINE;
+
+     for (;;)
+     {
+         /* handle multiline quoted fields */
+         if (in_quote && line_buf.cursor >= line_buf.len)
+         {
+             bool done;
+
+             switch(eol_type)
+             {
+                 case EOL_NL:
+                     appendStringInfoString(&attribute_buf,"\n");
+                     break;
+                 case EOL_CR:
+                     appendStringInfoString(&attribute_buf,"\r");
+                     break;
+                 case EOL_CRNL:
+                     appendStringInfoString(&attribute_buf,"\r\n");
+                     break;
+                 case EOL_UNKNOWN:
+                     /* shouldn't happen - just keep going */
+                     break;
+             }
+
+             copy_lineno++;
+             done = CopyReadLine();
+             if (done && line_buf.len == 0)
+                 break;
+             start_cursor = line_buf.cursor;
+         }
+
+         end_cursor = line_buf.cursor;
+         if (line_buf.cursor >= line_buf.len)
+             break;
+         c = line_buf.data[line_buf.cursor++];
+         /*
+          * unquoted field delimiter
+          */
+         if (!in_quote && c == delimc)
+         {
+             *result = NORMAL_ATTR;
+             break;
+         }
+         /*
+          * start of quoted field (or part of field)
+          */
+         if (!in_quote && c == quotec)
+         {
+             saw_quote = true;
+             in_quote = true;
+             continue;
+         }
+         /*
+          * escape within a quoted field
+          */
+         if (in_quote && c == escapec)
+         {
+             /*
+              * peek at the next char if available, and escape it if it
+              * is an escape char or a quote char
+              */
+             if (line_buf.cursor <= line_buf.len)
+             {
+                 char nextc = line_buf.data[line_buf.cursor];
+                 if (nextc == escapec || nextc == quotec)
+                 {
+                     appendStringInfoCharMacro(&attribute_buf, nextc);
+                     line_buf.cursor++;
+                     continue;
+                 }
+             }
+         }
+         /*
+          * end of quoted field.
+          * Must do this test after testing for escape in case quote char
+          * and escape char are the same (which is the common case).
+          */
+         if(in_quote && c == quotec)
+         {
+             in_quote = false;
+             continue;
+         }
+         appendStringInfoCharMacro(&attribute_buf, c);
+     }
+
+     if (in_quote)
+         *result = UNTERMINATED_FIELD;
+
+     /* check whether raw input matched null marker */
+     input_len = end_cursor - start_cursor;
+     if (!saw_quote && input_len == strlen(null_print) &&
+         strncmp(&line_buf.data[start_cursor], null_print, input_len) == 0)
+         *isnull = true;
+     else
+         *isnull = false;
+
+     return attribute_buf.data;
+ }
+
  /*
   * Read a binary attribute
   */
***************
*** 2192,2197 ****
--- 2395,2467 ----
                  break;
          }
      }
+ }
+
+ /*
+  * Send CSV representation of one attribute, with conversion and
+  * CSV type escaping
+  */
+ static void
+ CopyAttributeOutCSV(char *server_string, char *delim)
+ {
+     char       *string;
+     char        c;
+     char        delimc = delim[0];
+     char        quotec = delim[1];
+      char        escapec = delim[2] ? delim[2] : delim[1];
+     bool        need_quote = false;
+     char        *test_string;
+     bool        same_encoding;
+     int            mblen;
+     int            i;
+
+     same_encoding = (server_encoding == client_encoding);
+     if (!same_encoding)
+         string = (char *) pg_server_to_client((unsigned char *) server_string,
+                                               strlen(server_string));
+     else
+         string = server_string;
+
+     /* have to run through the string twice,
+      * first time to see if it needs quoting, second to actually send it
+      */
+
+     for(test_string = string; (c = *test_string) != '\0'; test_string += mblen)
+     {
+         if (c == delimc || c == quotec || c == '\n' || c == '\r')
+         {
+             need_quote = true;
+             break;
+         }
+         if (!same_encoding)
+             mblen = pg_encoding_mblen(client_encoding, test_string);
+         else
+             mblen = 1;
+     }
+
+     if (need_quote)
+         CopySendChar(quotec);
+
+     for (; (c = *string) != '\0'; string += mblen)
+     {
+         if (c == quotec || c == escapec)
+             CopySendChar(escapec);
+
+         CopySendChar(c);
+
+         if (!same_encoding)
+         {
+             /* send additional bytes of the char, if any */
+             mblen = pg_encoding_mblen(client_encoding, string);
+             for (i = 1; i < mblen; i++)
+                 CopySendChar(string[i]);
+         }
+         else
+             mblen = 1;
+     }
+
+     if (need_quote)
+         CopySendChar(quotec);
  }

  /*

pgsql-patches by date:

Previous
From: Tom Lane
Date:
Subject: Re: pg_restore ignore error patch
Next
From: Fabien COELHO
Date:
Subject: Re: pg_restore ignore error patch