From 38b587dda44cb7160ee734cdea55a573f302c3a9 Mon Sep 17 00:00:00 2001 From: Manni Wood Date: Fri, 5 Dec 2025 18:33:46 -0600 Subject: [PATCH v4 2/2] Speed up COPY FROM text/CSV parsing using SIMD Authors: Shinya Kato , Nazir Bilal Yavuz , Ayoub Kazar Reviewers: Andrew Dunstan Discussion: https://www.postgresql.org/message-id/flat/CAOzEurSW8cNr6TPKsjrstnPfhf4QyQqB4tnPXGGe8N4e_v7Jig@mail.gmail.com --- src/backend/commands/copyfrom.c | 3 +++ src/backend/commands/copyfromparse.c | 29 +++++++++++++++++++++++- src/include/commands/copyfrom_internal.h | 11 +++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index 12781963b4f..e638623e5b5 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -1720,6 +1720,9 @@ BeginCopyFrom(ParseState *pstate, cstate->cur_attname = NULL; cstate->cur_attval = NULL; cstate->relname_only = false; + cstate->special_chars_encountered = 0; + cstate->checked_simd = false; + cstate->use_simd = false; /* * Allocate buffers for the input pipeline. diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 1edb525f072..8cfdfcd4cd8 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -1346,6 +1346,28 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) #ifndef USE_NO_SIMD + /* + * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK. + * cstate->bytes_processed will grow an unpredictable amount with each + * call to this function, so just wait until we have crossed the + * threshold. + */ + if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK) + { + cstate->checked_simd = true; + + /* + * If we have not read too many special characters + * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up + * processing. 
This heuristic assumes that input does not vary too + * much from line to line and that the number of special characters + * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK bytes is + * indicative of the whole file. + */ + if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD) + cstate->use_simd = true; + } + /* * Use SIMD instructions to efficiently scan the input buffer for * special characters (e.g., newline, carriage return, quote, and @@ -1358,7 +1380,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * sequentially. - The remaining buffer is smaller than one vector * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. */ - if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) { Vector8 chunk; Vector8 match = vector8_broadcast(0); @@ -1415,6 +1437,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) */ if (c == '\r') { + cstate->special_chars_encountered++; IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); } @@ -1446,6 +1469,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \r */ if (c == '\r' && (!is_csv || !in_quote)) { + cstate->special_chars_encountered++; /* Check for \r\n on first line, _and_ handle \r\n. 
*/ if (cstate->eol_type == EOL_UNKNOWN || cstate->eol_type == EOL_CRNL) @@ -1502,6 +1526,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \n */ if (c == '\n' && (!is_csv || !in_quote)) { + cstate->special_chars_encountered++; if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), @@ -1524,6 +1549,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) { char c2; + cstate->special_chars_encountered++; + IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); IF_NEED_REFILL_AND_EOF_BREAK(0); diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h index c8b22af22d8..215215f909f 100644 --- a/src/include/commands/copyfrom_internal.h +++ b/src/include/commands/copyfrom_internal.h @@ -181,6 +181,17 @@ typedef struct CopyFromStateData #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index) uint64 bytes_processed; /* number of bytes processed so far */ + + /* the amount of bytes to read until checking if we should try simd */ +#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000 + /* the number of special chars read below which we use simd */ +#define SPECIAL_CHAR_SIMD_THRESHOLD 20000 + uint64 special_chars_encountered; /* number of special chars + * encountered so far */ + bool checked_simd; /* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK + * and checked if we should use SIMD on the + * rest of the file */ + bool use_simd; /* use simd to speed up copying */ } CopyFromStateData; extern void ReceiveCopyBegin(CopyFromState cstate); -- 2.52.0