CopyReadAttributesCSV optimization - Mailing list pgsql-patches
From | Heikki Linnakangas |
---|---|
Subject | CopyReadAttributesCSV optimization |
Date | |
Msg-id | 47C876B3.7020105@enterprisedb.com Whole thread Raw |
Responses |
Re: CopyReadAttributesCSV optimization
Re: CopyReadAttributesCSV optimization |
List | pgsql-patches |
Here's a patch to speed up CopyReadAttributesCSV. On the test case I've been playing with, loading the TPC-H partsupp table, about 20% CopyReadAttributesCSV (inlined into DoCopy, DoCopy itself is insignificant): samples % image name symbol name 8136 25.8360 postgres CopyReadLine 6350 20.1645 postgres DoCopy 2181 6.9258 postgres pg_verify_mbstr_len 2157 6.8496 reiserfs (no symbols) 1668 5.2968 libc-2.7.so memcpy 1142 3.6264 libc-2.7.so ____strtod_l_internal 951 3.0199 postgres heap_formtuple 904 2.8707 libc-2.7.so ____strtol_l_internal 619 1.9656 libc-2.7.so memset 442 1.4036 libc-2.7.so strlen 341 1.0828 postgres hash_any 329 1.0447 postgres pg_atoi 300 0.9527 postgres AllocSetAlloc With this patch, the usage of that function goes down to ~13% samples % image name symbol name 7191 28.7778 postgres CopyReadLine 3257 13.0343 postgres DoCopy 2127 8.5121 reiserfs (no symbols) 1914 7.6597 postgres pg_verify_mbstr_len 1413 5.6547 libc-2.7.so memcpy 920 3.6818 libc-2.7.so ____strtod_l_internal 784 3.1375 libc-2.7.so ____strtol_l_internal 745 2.9814 postgres heap_formtuple 508 2.0330 libc-2.7.so memset 398 1.5928 libc-2.7.so strlen 315 1.2606 postgres hash_any 255 1.0205 postgres AllocSetAlloc The trick is to split the loop in CopyReadAttributesCSV into two parts, inside quotes, and outside quotes, saving some instructions in both parts. Your mileage may vary, but I'm quite happy with this. I haven't tested it much yet, but I wouldn't expect it to be a loss in any interesting scenario. The code also doesn't look much worse after the patch, perhaps even better. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/commands/copy.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v retrieving revision 1.295 diff -c -r1.295 copy.c *** src/backend/commands/copy.c 1 Jan 2008 19:45:48 -0000 1.295 --- src/backend/commands/copy.c 29 Feb 2008 20:57:09 -0000 *************** *** 2913,2919 **** for (;;) { bool found_delim = false; - bool in_quote = false; bool saw_quote = false; char *start_ptr; char *end_ptr; --- 3146,3151 ---- *************** *** 2934,3000 **** { char c; ! end_ptr = cur_ptr; ! if (cur_ptr >= line_end_ptr) ! break; ! c = *cur_ptr++; ! /* unquoted field delimiter */ ! if (c == delimc && !in_quote) { ! found_delim = true; ! break; ! } ! /* start of quoted field (or part of field) */ ! if (c == quotec && !in_quote) ! { ! saw_quote = true; ! in_quote = true; ! continue; } ! /* escape within a quoted field */ ! if (c == escapec && in_quote) { ! /* ! * peek at the next char if available, and escape it if it is ! * an escape char or a quote char ! */ ! if (cur_ptr < line_end_ptr) ! { ! char nextc = *cur_ptr; ! if (nextc == escapec || nextc == quotec) { ! *output_ptr++ = nextc; ! cur_ptr++; ! continue; } } ! } ! /* ! * end of quoted field. Must do this test after testing for escape ! * in case quote char and escape char are the same (which is the ! * common case). ! */ ! if (c == quotec && in_quote) ! { ! in_quote = false; ! continue; } - - /* Add c to output string */ - *output_ptr++ = c; } /* Terminate attribute value in output area */ *output_ptr++ = '\0'; - /* Shouldn't still be in quote mode */ - if (in_quote) - ereport(ERROR, - (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), - errmsg("unterminated CSV quoted field"))); - /* Check whether raw input matched null marker */ input_len = end_ptr - start_ptr; if (!saw_quote && input_len == cstate->null_print_len && --- 3166,3241 ---- { char c; ! /* Not in quote */ ! for (;;) { ! end_ptr = cur_ptr; ! if (cur_ptr >= line_end_ptr) ! goto endfield; ! c = *cur_ptr++; ! /* unquoted field delimiter */ ! if (c == delimc) ! { ! found_delim = true; ! goto endfield; ! } ! /* start of quoted field (or part of field) */ ! if (c == quotec) ! { ! saw_quote = true; ! break; ! } ! /* Add c to output string */ ! *output_ptr++ = c; } ! ! /* In quote */ ! for (;;) { ! end_ptr = cur_ptr; ! if (cur_ptr >= line_end_ptr) ! ereport(ERROR, ! (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), ! errmsg("unterminated CSV quoted field"))); ! c = *cur_ptr++; ! ! /* escape within a quoted field */ ! if (c == escapec) ! { ! /* ! * peek at the next char if available, and escape it if it is ! * an escape char or a quote char ! */ ! if (cur_ptr < line_end_ptr) { ! char nextc = *cur_ptr; ! ! if (nextc == escapec || nextc == quotec) ! { ! *output_ptr++ = nextc; ! cur_ptr++; ! continue; ! } } } ! /* ! * end of quoted field. Must do this test after testing for escape ! * in case quote char and escape char are the same (which is the ! * common case). ! */ ! if (c == quotec) ! break; ! /* Add c to output string */ ! *output_ptr++ = c; } } + endfield: /* Terminate attribute value in output area */ *output_ptr++ = '\0'; /* Check whether raw input matched null marker */ input_len = end_ptr - start_ptr; if (!saw_quote && input_len == cstate->null_print_len &&
pgsql-patches by date: