From e4546b0612bd2fde6190a9ade6e60a1f08299184 Mon Sep 17 00:00:00 2001 From: Manni Wood Date: Fri, 5 Dec 2025 18:30:00 -0600 Subject: [PATCH v4.2 1/3] Speed up COPY FROM text/CSV parsing using SIMD Authors: Shinya Kato , Nazir Bilal Yavuz , Ayoub Kazar Reviewers: Andrew Dunstan Discussion: https://www.postgresql.org/message-id/flat/CAOzEurSW8cNr6TPKsjrstnPfhf4QyQqB4tnPXGGe8N4e_v7Jig@mail.gmail.com --- src/backend/commands/copyfromparse.c | 73 ++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 62afcd8fad1..673d6683a72 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -71,7 +71,9 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_bitutils.h" #include "port/pg_bswap.h" +#include "port/simd.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) char quotec = '\0'; char escapec = '\0'; +#ifndef USE_NO_SIMD + Vector8 nl = vector8_broadcast('\n'); + Vector8 cr = vector8_broadcast('\r'); + Vector8 bs = vector8_broadcast('\\'); + Vector8 quote = vector8_broadcast(0); + Vector8 escape = vector8_broadcast(0); +#endif + if (is_csv) { quotec = cstate->opts.quote[0]; @@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* ignore special escape processing if it's the same as quotec */ if (quotec == escapec) escapec = '\0'; + +#ifndef USE_NO_SIMD + quote = vector8_broadcast(quotec); + if (quotec != escapec) + escape = vector8_broadcast(escapec); +#endif } /* @@ -1328,6 +1344,63 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) need_data = false; } +#ifndef USE_NO_SIMD + + /* + * Use SIMD instructions to efficiently scan the input buffer for + * special characters (e.g., newline, carriage return, quote, and + * escape). This is faster than byte-by-byte iteration, especially on + * large buffers. 
+ * + * We do not apply the SIMD fast path in either of the following + * cases: - When the previously processed character was an escape + * character (last_was_esc), since the next byte must be examined + * sequentially. - The remaining buffer is smaller than one vector + * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. + */ + if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match = vector8_broadcast(0); + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]); + + /* \n and \r are not special inside quotes */ + if (!in_quote) + match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr)); + + if (is_csv) + { + match = vector8_or(match, vector8_eq(chunk, quote)); + if (escapec != '\0') + match = vector8_or(match, vector8_eq(chunk, escape)); + } + else + match = vector8_or(match, vector8_eq(chunk, bs)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point and let + * the scalar code handle it. + */ + int advance = pg_rightmost_one_pos32(mask); + + input_buf_ptr += advance; + } + else + { + /* No special characters found, so skip the entire chunk */ + input_buf_ptr += sizeof(Vector8); + continue; + } + } +#endif + /* OK to fetch a character */ prev_raw_ptr = input_buf_ptr; c = copy_input_buf[input_buf_ptr++]; -- 2.51.0