CopyReadAttributesCSV optimization - Mailing list pgsql-patches

From Heikki Linnakangas
Subject CopyReadAttributesCSV optimization
Date
Msg-id 47C876B3.7020105@enterprisedb.com
Whole thread Raw
Responses Re: CopyReadAttributesCSV optimization
Re: CopyReadAttributesCSV optimization
List pgsql-patches
Here's a patch to speed up CopyReadAttributesCSV. On the test case I've
been playing with, loading the TPC-H partsupp table, about 20%
CopyReadAttributesCSV (inlined into DoCopy, DoCopy itself is insignificant):

samples  %        image name               symbol name
8136     25.8360  postgres                 CopyReadLine
6350     20.1645  postgres                 DoCopy
2181      6.9258  postgres                 pg_verify_mbstr_len
2157      6.8496  reiserfs                 (no symbols)
1668      5.2968  libc-2.7.so              memcpy
1142      3.6264  libc-2.7.so              ____strtod_l_internal
951       3.0199  postgres                 heap_formtuple
904       2.8707  libc-2.7.so              ____strtol_l_internal
619       1.9656  libc-2.7.so              memset
442       1.4036  libc-2.7.so              strlen
341       1.0828  postgres                 hash_any
329       1.0447  postgres                 pg_atoi
300       0.9527  postgres                 AllocSetAlloc

With this patch, the usage of that function goes down to ~13%

samples  %        image name               symbol name
7191     28.7778  postgres                 CopyReadLine
3257     13.0343  postgres                 DoCopy
2127      8.5121  reiserfs                 (no symbols)
1914      7.6597  postgres                 pg_verify_mbstr_len
1413      5.6547  libc-2.7.so              memcpy
920       3.6818  libc-2.7.so              ____strtod_l_internal
784       3.1375  libc-2.7.so              ____strtol_l_internal
745       2.9814  postgres                 heap_formtuple
508       2.0330  libc-2.7.so              memset
398       1.5928  libc-2.7.so              strlen
315       1.2606  postgres                 hash_any
255       1.0205  postgres                 AllocSetAlloc

The trick is to split the loop in CopyReadAttributesCSV into two parts,
inside quotes, and outside quotes, saving some instructions in both
parts.

Your mileage may vary, but I'm quite happy with this. I haven't tested
it much yet, but I wouldn't expect it to be a loss in any interesting
scenario. The code also doesn't look much worse after the patch, perhaps
even better.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c    1 Jan 2008 19:45:48 -0000    1.295
--- src/backend/commands/copy.c    29 Feb 2008 20:57:09 -0000
***************
*** 2913,2919 ****
      for (;;)
      {
          bool        found_delim = false;
-         bool        in_quote = false;
          bool        saw_quote = false;
          char       *start_ptr;
          char       *end_ptr;
--- 3146,3151 ----
***************
*** 2934,3000 ****
          {
              char        c;

!             end_ptr = cur_ptr;
!             if (cur_ptr >= line_end_ptr)
!                 break;
!             c = *cur_ptr++;
!             /* unquoted field delimiter */
!             if (c == delimc && !in_quote)
              {
!                 found_delim = true;
!                 break;
!             }
!             /* start of quoted field (or part of field) */
!             if (c == quotec && !in_quote)
!             {
!                 saw_quote = true;
!                 in_quote = true;
!                 continue;
              }
!             /* escape within a quoted field */
!             if (c == escapec && in_quote)
              {
!                 /*
!                  * peek at the next char if available, and escape it if it is
!                  * an escape char or a quote char
!                  */
!                 if (cur_ptr < line_end_ptr)
!                 {
!                     char        nextc = *cur_ptr;

!                     if (nextc == escapec || nextc == quotec)
                      {
!                         *output_ptr++ = nextc;
!                         cur_ptr++;
!                         continue;
                      }
                  }
!             }

!             /*
!              * end of quoted field. Must do this test after testing for escape
!              * in case quote char and escape char are the same (which is the
!              * common case).
!              */
!             if (c == quotec && in_quote)
!             {
!                 in_quote = false;
!                 continue;
              }
-
-             /* Add c to output string */
-             *output_ptr++ = c;
          }

          /* Terminate attribute value in output area */
          *output_ptr++ = '\0';

-         /* Shouldn't still be in quote mode */
-         if (in_quote)
-             ereport(ERROR,
-                     (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
-                      errmsg("unterminated CSV quoted field")));
-
          /* Check whether raw input matched null marker */
          input_len = end_ptr - start_ptr;
          if (!saw_quote && input_len == cstate->null_print_len &&
--- 3166,3241 ----
          {
              char        c;

!             /* Not in quote */
!             for (;;)
              {
!                 end_ptr = cur_ptr;
!                 if (cur_ptr >= line_end_ptr)
!                     goto endfield;
!                 c = *cur_ptr++;
!                 /* unquoted field delimiter */
!                 if (c == delimc)
!                 {
!                     found_delim = true;
!                     goto endfield;
!                 }
!                 /* start of quoted field (or part of field) */
!                 if (c == quotec)
!                 {
!                     saw_quote = true;
!                     break;
!                 }
!                 /* Add c to output string */
!                 *output_ptr++ = c;
              }
!
!             /* In quote */
!             for (;;)
              {
!                 end_ptr = cur_ptr;
!                 if (cur_ptr >= line_end_ptr)
!                     ereport(ERROR,
!                             (errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
!                              errmsg("unterminated CSV quoted field")));

!                 c = *cur_ptr++;
!
!                 /* escape within a quoted field */
!                 if (c == escapec)
!                 {
!                     /*
!                      * peek at the next char if available, and escape it if it is
!                      * an escape char or a quote char
!                      */
!                     if (cur_ptr < line_end_ptr)
                      {
!                         char        nextc = *cur_ptr;
!
!                         if (nextc == escapec || nextc == quotec)
!                         {
!                             *output_ptr++ = nextc;
!                             cur_ptr++;
!                             continue;
!                         }
                      }
                  }
!                 /*
!                  * end of quoted field. Must do this test after testing for escape
!                  * in case quote char and escape char are the same (which is the
!                  * common case).
!                  */
!                 if (c == quotec)
!                     break;

!                 /* Add c to output string */
!                 *output_ptr++ = c;
              }
          }
+     endfield:

          /* Terminate attribute value in output area */
          *output_ptr++ = '\0';

          /* Check whether raw input matched null marker */
          input_len = end_ptr - start_ptr;
          if (!saw_quote && input_len == cstate->null_print_len &&

pgsql-patches by date:

Previous
From: Alvaro Herrera
Date:
Subject: Re: remove TCL_ARRAYS
Next
From: Robert Lor
Date:
Subject: Re: DTrace probe patch for OS X Leopard