Re: CopyReadLineText optimization - Mailing list pgsql-patches
From | Heikki Linnakangas |
---|---|
Subject | Re: CopyReadLineText optimization |
Date | |
Msg-id | 47CEB31A.3050406@enterprisedb.com Whole thread Raw |
In response to | Re: CopyReadLineText optimization ("Heikki Linnakangas" <heikki@enterprisedb.com>) |
Responses |
Re: CopyReadLineText optimization
|
List | pgsql-patches |
Heikki Linnakangas wrote: > Heikki Linnakangas wrote: >> Attached is a patch that modifies CopyReadLineText so that it uses >> memchr to speed up the scan. The nice thing about memchr is that we >> can take advantage of any clever optimizations that might be in libc >> or compiler. > > Here's an updated version of the patch. The principle is the same, but > the same optimization is now used for CSV input as well, and there's > more comments. Another update attached: It occurred to me that the memchr approach is only safe for server encodings, where the non-first bytes of a multi-byte character always have the hi-bit set. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com Index: src/backend/commands/copy.c =================================================================== RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v retrieving revision 1.295 diff -c -r1.295 copy.c *** src/backend/commands/copy.c 1 Jan 2008 19:45:48 -0000 1.295 --- src/backend/commands/copy.c 5 Mar 2008 14:44:58 -0000 *************** *** 67,72 **** --- 67,275 ---- EOL_CRNL } EolType; + + /***** Fast byte searcher functions ***** + * + * CopyReadLine needs to search for newlines, as well as quoting and escape + * characters to determine which newlines are real and which ones are escaped. + * Doing that in the naive way, looping through the input byte by byte and + * comparing against the interesting characters, can be slow. + * + * To speed that up, we provide a "searcher" interface, that can be used to + * search a piece of memory for up to 4 different bytes simultaneously. It's + * similar to memchr, but allows searching for multiple chars at the same + * time. + * + * To start a search on a new block of memory, call init_searcher. Then call + * next_searcher repeatedly to return all the occurrances of the bytes being + * searched for. + * + * The searcher implementation uses memchr to search for the bytes, and keeps + * track of where the next occurrance of each is. Using memchr allows us + * to take advantage of any platform-specific optimizations. + */ + + /* + * Struct used to store state of the current search between calls to + * next_searcher. Initialized by init_search. + */ + typedef struct { + /* Each element in c corresponds the element in s. These are sorted + * by the pointers (ptr) */ + int n; /* elements used in the arrays */ + char c[4]; /* bytes we're looking for */ + char *ptr[4]; /* pointers to next occurances of the bytes */ + char *end; /* end of block being searched. Last byte to search is end-1 */ + } searcher; + + /* Functions internal to "searcher" */ + + #define swap_searcher_entries(searcher, a, b) do { \ + char tmpc = (searcher)->c[a]; \ + char *tmpptr = (searcher)->ptr[a]; \ + (searcher)->c[a] = (searcher)->c[b]; \ + (searcher)->ptr[a] = (searcher)->ptr[b]; \ + (searcher)->c[b] = tmpc; \ + (searcher)->ptr[b] = tmpptr; \ + } while(0) + + static void + sort_searcher(searcher *s) + { + switch(s->n) + { + case 0: + case 1: + /* Nothing to do */ + break; + case 2: + if (s->ptr[0] > s->ptr[1]) + swap_searcher_entries(s, 0, 1); + break; + case 3: + if (s->ptr[0] > s->ptr[2]) + swap_searcher_entries(s, 0, 2); + if (s->ptr[0] > s->ptr[1]) + swap_searcher_entries(s, 0, 1); + if (s->ptr[1] > s->ptr[2]) + swap_searcher_entries(s, 1, 2); + break; + case 4: + if (s->ptr[0] > s->ptr[1]) + swap_searcher_entries(s, 0, 1); + if (s->ptr[2] > s->ptr[3]) + swap_searcher_entries(s, 2, 3); + if (s->ptr[1] > s->ptr[2]) + swap_searcher_entries(s, 2, 3); + if (s->ptr[0] > s->ptr[1]) + swap_searcher_entries(s, 0, 1); + if (s->ptr[2] > s->ptr[3]) + swap_searcher_entries(s, 2, 3); + break; + } + } + + /* Remove the topmost element */ + static void + remove_top(searcher *searcher) + { + int i; + + searcher->n--; + + /* Move up the remaining items */ + for (i = 0; i < searcher->n; i++) + { + searcher->c[i] = searcher->c[i + 1]; + searcher->ptr[i] = searcher->ptr[i + 1]; + } + } + + /* + * The first element in the list has been replaced by the caller. + * Move it to the right place in the list, to keep it sorted. + */ + static void + sift_down(searcher *searcher) + { + if (searcher->n < 2 || searcher->ptr[0] <= searcher->ptr[1]) + return; + swap_searcher_entries(searcher, 0, 1); + + if (searcher->n < 3 || searcher->ptr[1] <= searcher->ptr[2]) + return; + swap_searcher_entries(searcher, 1, 2); + + if (searcher->n < 4 || searcher->ptr[2] <= searcher->ptr[3]) + return; + swap_searcher_entries(searcher, 2, 3); + } + + /* + * Starts a new search in a block of memory. + * + * searcher - for storing state between calls. Opaque to caller, + * init_searcher will initialize this. + * str - block of memory to search + * len - length of str + * c - bytes to search for, max 4. + * n - number of bytes in c + */ + static void + init_searcher(searcher *searcher, char *str, int len, char c[4], int n) + { + int i; + + Assert(n <= 4); + + searcher->n = 0; + searcher->end = str + len; + + /* Find first occurances of the searched-for bytes */ + for (i = 0; i < n; i++) + { + char *hit = memchr(str, c[i], len); + if (hit != NULL) + { + searcher->c[searcher->n] = c[i]; + searcher->ptr[searcher->n] = hit; + searcher->n++; + } + /* + * If the byte doesn't occur in the string at all, we don't + * need to put it in the list. + */ + } + sort_searcher(searcher); + } + + /* + * Look for the next interesting character + * + * searcher - state, opaque to caller + * str - block to search. + * + * 'str' must be part of the block passed to init_searcher, and >= + * the return value of last call to next_searcher. + * + * Returns the offset of the next interesting location, relative to 'ss' + */ + static char * + next_searcher(searcher *searcher, char *str) + { + Assert(str < searcher->end); + + if (searcher->n == 0) + return NULL; + + /* + * If the caller skipped over the next pointer we had memorized, we have to + * search for the one after that. + */ + while (str > searcher->ptr[0]) + { + char *s = memchr(str, searcher->c[0], searcher->end - str); + + if (s == NULL) + { + remove_top(searcher); + if (searcher->n == 0) + return NULL; + } + else + { + searcher->ptr[0] = s; + sift_down(searcher); + } + } + + /* + * Return the next interesting location we had memorized. + */ + return searcher->ptr[0]; + } + + /* * This struct contains all the state variables used throughout a COPY * operation. For simplicity, we use the same struct for all variants of COPY, *************** *** 159,164 **** --- 362,369 ---- char *raw_buf; int raw_buf_index; /* next byte to process */ int raw_buf_len; /* total # of bytes stored */ + + searcher searcher; } CopyStateData; typedef CopyStateData *CopyState; *************** *** 2285,2299 **** --- 2490,2516 ---- last_was_esc = false; char quotec = '\0'; char escapec = '\0'; + char search_chars[4]; + int nsearch_chars = 0; + + search_chars[nsearch_chars++] = '\n'; + search_chars[nsearch_chars++] = '\r'; if (cstate->csv_mode) { quotec = cstate->quote[0]; escapec = cstate->escape[0]; + + search_chars[nsearch_chars++] = quotec; + /* ignore special escape processing if it's the same as quotec */ if (quotec == escapec) escapec = '\0'; + else + search_chars[nsearch_chars++] = escapec; } + else + search_chars[nsearch_chars++] = '\\'; mblen_str[1] = '\0'; *************** *** 2360,2368 **** --- 2577,2603 ---- break; } need_data = false; + + if (!cstate->encoding_embeds_ascii) + init_searcher(&cstate->searcher, copy_raw_buf, copy_buf_len, + search_chars, nsearch_chars); } /* OK to fetch a character */ + if ((!cstate->csv_mode || !first_char_in_line) + && !cstate->encoding_embeds_ascii + && (raw_buf_ptr + 4 <= copy_buf_len)) + { + char *next = next_searcher(&cstate->searcher, + ©_raw_buf[raw_buf_ptr]); + + if (next != NULL) + { + raw_buf_ptr = next - copy_raw_buf; + first_char_in_line = false; + } + } + prev_raw_ptr = raw_buf_ptr; c = copy_raw_buf[raw_buf_ptr++];
pgsql-patches by date: