From 203d648c4cf64c6d629f2abc719a371dd0393e22 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz Date: Thu, 7 Aug 2025 13:27:34 +0300 Subject: [PATCH v2] Feedback --- src/backend/commands/copyfromparse.c | 176 ++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 16 deletions(-) diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 5aba0fa6cb7..7b83e64e23b 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -670,8 +670,12 @@ CopyLoadInputBuf(CopyFromState cstate) /* If we now have some unconverted data, try to convert it */ CopyConvertBuf(cstate); - /* If we now have some more input bytes ready, return them */ - if (INPUT_BUF_BYTES(cstate) > nbytes) + /* + * If we now have at least sizeof(Vector8) input bytes ready, return + * them. This is beneficial for SIMD processing in the + * CopyReadLineText() function. + */ + if (INPUT_BUF_BYTES(cstate) > nbytes + sizeof(Vector8)) return; /* @@ -1322,7 +1326,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * unsafe with the old v2 COPY protocol, but we don't support that * anymore. */ - if (input_buf_ptr >= copy_buf_len || need_data) + if (input_buf_ptr + sizeof(Vector8) >= copy_buf_len || need_data) { REFILL_LINEBUF; @@ -1345,21 +1349,22 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) } #ifndef USE_NO_SIMD + /* - * SIMD instructions are used here to efficiently scan the input buffer - * for special characters (e.g., newline, carriage return, quotes, or - * escape characters). This approach significantly improves performance - * compared to byte-by-byte iteration, especially for large input - * buffers. + * SIMD instructions are used here to efficiently scan the input + * buffer for special characters (e.g., newline, carriage return, + * quotes, or escape characters). This approach significantly improves + * performance compared to byte-by-byte iteration, especially for + * large input buffers. 
* - * However, SIMD optimization cannot be applied in the following cases: - * - Inside quoted fields, where escape sequences and closing quotes - * require sequential processing to handle correctly. - * - When the remaining buffer size is smaller than the size of a SIMD - * vector register, as SIMD operations require processing data in - * fixed-size chunks. + * However, SIMD optimization cannot be applied in the following + * cases: - Inside quoted fields, where escape sequences and closing + * quotes require sequential processing to handle correctly. - When + * the remaining buffer size is smaller than the size of a SIMD vector + * register, as SIMD operations require processing data in fixed-size + * chunks. */ - if (!in_quote && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + if (copy_buf_len - input_buf_ptr >= sizeof(Vector8)) { Vector8 chunk; Vector8 match; @@ -1388,13 +1393,15 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * Found a special character. Advance up to that point and let * the scalar code handle it. */ - int advance = pg_rightmost_one_pos32(mask); + int advance = pg_rightmost_one_pos32(mask); + input_buf_ptr += advance; } else { /* No special characters found, so skip the entire chunk */ input_buf_ptr += sizeof(Vector8); + last_was_esc = false; continue; } } @@ -1650,6 +1657,11 @@ CopyReadAttributesText(CopyFromState cstate) char *cur_ptr; char *line_end_ptr; +#ifndef USE_NO_SIMD + Vector8 bs = vector8_broadcast('\\'); + Vector8 delim = vector8_broadcast(delimc); +#endif + /* * We need a special case for zero-column tables: check that the input * line is empty, and return. 
@@ -1717,6 +1729,44 @@ CopyReadAttributesText(CopyFromState cstate) { char c; +#ifndef USE_NO_SIMD + if (line_end_ptr - cur_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match; + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) cur_ptr); + + /* Create a mask of all special characters we need to stop at */ + match = vector8_or(vector8_eq(chunk, bs), vector8_eq(chunk, delim)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point and + * let the scalar code handle it. + */ + int advance = pg_rightmost_one_pos32(mask); + + memcpy(output_ptr, cur_ptr, advance); + output_ptr += advance; + cur_ptr += advance; + } + else + { + /* No special characters found, so copy the entire chunk */ + memcpy(output_ptr, cur_ptr, sizeof(Vector8)); + output_ptr += sizeof(Vector8); + cur_ptr += sizeof(Vector8); + continue; + } + } +#endif + end_ptr = cur_ptr; if (cur_ptr >= line_end_ptr) break; @@ -1906,6 +1956,12 @@ CopyReadAttributesCSV(CopyFromState cstate) char *cur_ptr; char *line_end_ptr; +#ifndef USE_NO_SIMD + Vector8 quote = vector8_broadcast(quotec); + Vector8 delim = vector8_broadcast(delimc); + Vector8 escape = vector8_broadcast(escapec); +#endif + /* * We need a special case for zero-column tables: check that the input * line is empty, and return. 
@@ -1972,6 +2028,50 @@ CopyReadAttributesCSV(CopyFromState cstate) /* Not in quote */ for (;;) { +#ifndef USE_NO_SIMD + if (line_end_ptr - cur_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match; + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) cur_ptr); + + /* + * Create a mask of all special characters we need to stop + * at + */ + match = vector8_or(vector8_eq(chunk, quote), vector8_eq(chunk, delim)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point + * and let the scalar code handle it. + */ + int advance = pg_rightmost_one_pos32(mask); + + memcpy(output_ptr, cur_ptr, advance); + output_ptr += advance; + cur_ptr += advance; + } + else + { + /* + * No special characters found, so copy the entire + * chunk + */ + memcpy(output_ptr, cur_ptr, sizeof(Vector8)); + output_ptr += sizeof(Vector8); + cur_ptr += sizeof(Vector8); + continue; + } + } +#endif + end_ptr = cur_ptr; if (cur_ptr >= line_end_ptr) goto endfield; @@ -1995,6 +2095,50 @@ CopyReadAttributesCSV(CopyFromState cstate) /* In quote */ for (;;) { +#ifndef USE_NO_SIMD + if (line_end_ptr - cur_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match; + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) cur_ptr); + + /* + * Create a mask of all special characters we need to stop + * at + */ + match = vector8_or(vector8_eq(chunk, quote), vector8_eq(chunk, escape)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point + * and let the scalar code handle it. 
+ */ + int advance = pg_rightmost_one_pos32(mask); + + memcpy(output_ptr, cur_ptr, advance); + output_ptr += advance; + cur_ptr += advance; + } + else + { + /* + * No special characters found, so copy the entire + * chunk + */ + memcpy(output_ptr, cur_ptr, sizeof(Vector8)); + output_ptr += sizeof(Vector8); + cur_ptr += sizeof(Vector8); + continue; + } + } +#endif + end_ptr = cur_ptr; if (cur_ptr >= line_end_ptr) ereport(ERROR, -- 2.50.1