CopyReadLineText optimization - Mailing list pgsql-patches

From Heikki Linnakangas
Subject CopyReadLineText optimization
Date
Msg-id 47C0C88B.8090904@enterprisedb.com
Whole thread Raw
Responses Re: CopyReadLineText optimization  ("Luke Lonergan" <LLonergan@greenplum.com>)
Re: CopyReadLineText optimization  ("Heikki Linnakangas" <heikki@enterprisedb.com>)
List pgsql-patches
The purpose of CopyReadLineText is to scan the input buffer, and find
the next newline, taking into account any escape characters. It
currently operates in a loop, one byte at a time, searching for LF, CR,
or a backslash. That's a bit slow: I've been running oprofile on COPY,
and I've seen CopyReadLine to take around ~10% of the CPU time, and
Joshua Drake just posted a very similar profile to hackers.

Attached is a patch that modifies CopyReadLineText so that it uses
memchr to speed up the scan. The nice thing about memchr is that we can
take advantage of any clever optimizations that might be in libc or
compiler.

In the tests I've been running, it roughly halves the time spent in
CopyReadLine (including the new memchr calls), thus reducing the total
CPU overhead by ~5%. I'm planning to run more tests with data that has
backslashes and with different width tables to see what the worst-case
and best-case performance is like. Also, it doesn't work for CSV format
at the moment; that needs to be fixed.

5% isn't exactly breathtaking, but it's a start. I tried the same trick
to CopyReadAttributesText, but unfortunately it doesn't seem to help
there because you need to "stop" the efficient word-at-a-time scan that
memchr does (at least with glibc, YMMV) whenever there's a column
separator, while in CopyReadLineText you get to process the whole line
in one call, assuming there's no backslashes.

--
   Heikki Linnakangas
   EnterpriseDB   http://www.enterprisedb.com
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.295
diff -c -r1.295 copy.c
*** src/backend/commands/copy.c    1 Jan 2008 19:45:48 -0000    1.295
--- src/backend/commands/copy.c    24 Feb 2008 00:40:09 -0000
***************
*** 67,72 ****
--- 67,77 ----
      EOL_CRNL
  } EolType;

+ typedef struct {
+     char c;        /* byte we're looking for */
+     char *s;    /* pointer to next occurance of this byte */
+ } searchbyte;
+
  /*
   * This struct contains all the state variables used throughout a COPY
   * operation. For simplicity, we use the same struct for all variants of COPY,
***************
*** 159,164 ****
--- 164,171 ----
      char       *raw_buf;
      int            raw_buf_index;    /* next byte to process */
      int            raw_buf_len;    /* total # of bytes stored */
+
+     searchbyte    sb[3];            /* pointers to next "interesting" characters in raw_buf */
  } CopyStateData;

  typedef CopyStateData *CopyState;
***************
*** 170,175 ****
--- 177,283 ----
      CopyState    cstate;            /* CopyStateData for the command */
  } DR_copy;

+ #define swap_sb(a,b) do { \
+     searchbyte c = (a); \
+     (a) = (b); \
+     (b) = (c); \
+ } while(0)
+
+ /* is a < b? NULLs are last */
+ #define cmp_sb(a,b) ((a).s < (b).s && (a).s != NULL)
+
+ static void
+ sort_sb(searchbyte *sb)
+ {
+     if (cmp_sb(sb[2], sb[1]))
+         swap_sb(sb[1], sb[2]);
+     if (cmp_sb(sb[1], sb[0]))
+         swap_sb(sb[0], sb[1]);
+     if (cmp_sb(sb[2], sb[1]))
+         swap_sb(sb[1], sb[2]);
+ }
+
+ /*
+  * Starts a new search in a string
+  */
+ static void
+ init_interesting(searchbyte *sb, char *str, int len)
+ {
+     sb[0].c = '\n';
+     sb[1].c = '\r';
+     sb[2].c = '\\';
+     sb[0].s = memchr(str, '\n', len);
+     sb[1].s = memchr(str, '\r', len);
+     sb[2].s = memchr(str, '\\', len);
+     sort_sb(sb);
+ }
+
+ /*
+  * Look for the next interesting character
+  *
+  * sb - array of three searchbytes.
+  * ss - string to search
+  * len_left - number of bytes left in ss to search
+  *
+  * Returns the offset of the next interesting location, relative to 'ss'
+  */
+ static int
+ next_interesting(searchbyte *sb, char *ss, int len_left)
+ {
+     char *s;
+
+     if (sb[0].s == NULL)
+         return 0;
+
+     /*
+      * The caller skipped over the next pointer we had memorized, so we have to
+      * search for the one after that.
+      */
+     if (ss > sb[0].s)
+     {
+         sb[0].s = memchr(ss, sb[0].c, len_left);
+         if (sb[0].s == NULL)
+             return 0; /* we're done */
+         if (sb[1].s != NULL)
+         {
+             if (ss > sb[1].s)
+             {
+                 /*
+                  * The caller skipped over the 2nd pointer we had memorized
+                  * as well. Have to search for that as well
+                  */
+                 sb[1].s = memchr(ss, sb[1].c, len_left);
+                 if (sb[2].s != NULL && ss > sb[2].s)
+                 {
+                     /* Caller skipped over 3rd pointer as well... */
+                     sb[2].s = memchr(ss, sb[2].c, len_left);
+                 }
+             }
+             sort_sb(sb);
+         }
+     }
+
+     /*
+      * Return the next interesting location we had memorized, and search
+      * for the next occurance of that byte.
+      */
+     s = sb[0].s;
+
+     sb[0].s = memchr(s+1, sb[0].c, (ss+len_left) - (s + 1));
+
+     /*
+      * Bubble down the next location into the three-element list we have,
+      */
+     if (cmp_sb(sb[1], sb[0]))
+     {
+         swap_sb(sb[0], sb[1]);
+         if (cmp_sb(sb[2], sb[1]))
+             swap_sb(sb[1], sb[2]);
+     }
+
+     return s - ss;
+ }
+

  /*
   * These macros centralize code used to process line_buf and raw_buf buffers.
***************
*** 2350,2355 ****
--- 2458,2465 ----
              raw_buf_ptr = 0;
              copy_buf_len = cstate->raw_buf_len;

+             init_interesting(cstate->sb, copy_raw_buf, copy_buf_len);
+
              /*
               * If we are completely out of data, break out of the loop,
               * reporting EOF.
***************
*** 2363,2368 ****
--- 2473,2495 ----
          }

          /* OK to fetch a character */
+         if (!cstate->csv_mode)
+         {
+             int skip = next_interesting(cstate->sb,
+                                         ©_raw_buf[raw_buf_ptr],
+                                         copy_buf_len - raw_buf_ptr);
+
+
+             if (skip != 0)
+             {
+                 raw_buf_ptr += skip;
+                 first_char_in_line = false;
+
+                 prev_raw_ptr = raw_buf_ptr - 1;
+                 IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(1);
+             }
+         }
+
          prev_raw_ptr = raw_buf_ptr;
          c = copy_raw_buf[raw_buf_ptr++];


pgsql-patches by date:

Previous
From: Mathias Hasselmann
Date:
Subject: Re: Avahi support for Postgresql
Next
From: "Luke Lonergan"
Date:
Subject: Re: CopyReadLineText optimization