From 8d0e6766175abac15b39884126c29da03657be40 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz Date: Tue, 9 Dec 2025 15:32:10 +0300 Subject: [PATCH v4.1 3/3] Feedback / Changes --- src/include/commands/copyfrom_internal.h | 9 +-- src/backend/commands/copyfrom.c | 1 + src/backend/commands/copyfromparse.c | 88 +++++++++++++++--------- 3 files changed, 60 insertions(+), 38 deletions(-) diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h index 215215f909f..397720bf875 100644 --- a/src/include/commands/copyfrom_internal.h +++ b/src/include/commands/copyfrom_internal.h @@ -183,12 +183,13 @@ typedef struct CopyFromStateData uint64 bytes_processed; /* number of bytes processed so far */ /* the amount of bytes to read until checking if we should try simd */ -#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000 - /* the number of special chars read below which we use simd */ -#define SPECIAL_CHAR_SIMD_THRESHOLD 20000 +#define CHARS_PROCESSED_UNTIL_SIMD_CHECK 100000 + /* the ratio of special chars read below which we use simd */ +#define SPECIAL_CHAR_SIMD_RATIO 4 + uint64 chars_processed; uint64 special_chars_encountered; /* number of special chars * encountered so far */ - bool checked_simd; /* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK + bool checked_simd; /* we read CHARS_PROCESSED_UNTIL_SIMD_CHECK * and checked if we should use SIMD on the * rest of the file */ bool use_simd; /* use simd to speed up copying */ diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index e638623e5b5..d44dd16eced 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -1720,6 +1720,7 @@ BeginCopyFrom(ParseState *pstate, cstate->cur_attname = NULL; cstate->cur_attval = NULL; cstate->relname_only = false; + cstate->chars_processed = 0; cstate->special_chars_encountered = 0; cstate->checked_simd = false; cstate->use_simd = false; diff --git a/src/backend/commands/copyfromparse.c 
b/src/backend/commands/copyfromparse.c index 549b56c21fb..86a268d0df9 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -143,7 +143,7 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; /* non-export function prototypes */ static bool CopyReadLine(CopyFromState cstate, bool is_csv); -static bool CopyReadLineText(CopyFromState cstate, bool is_csv); +static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd); static int CopyReadAttributesText(CopyFromState cstate); static int CopyReadAttributesCSV(CopyFromState cstate); static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, @@ -1173,8 +1173,40 @@ CopyReadLine(CopyFromState cstate, bool is_csv) resetStringInfo(&cstate->line_buf); cstate->line_buf_valid = false; - /* Parse data and transfer into line_buf */ - result = CopyReadLineText(cstate, is_csv); +#ifndef USE_NO_SIMD + + /* + * Wait until we have read more than CHARS_PROCESSED_UNTIL_SIMD_CHECK. + * cstate->chars_processed will grow an unpredictable amount with each + * call to this function, so just wait until we have crossed the + * threshold. + */ + if (!cstate->checked_simd && cstate->chars_processed > CHARS_PROCESSED_UNTIL_SIMD_CHECK) + { + cstate->checked_simd = true; + + /* + * If we have not read too many special characters then start using + * SIMD to speed up processing. This heuristic assumes that input does + * not vary too much from line to line and that number of special + * characters encountered in the first + * CHARS_PROCESSED_UNTIL_SIMD_CHECK are indicative of the whole file. + */ + if (cstate->chars_processed / SPECIAL_CHAR_SIMD_RATIO >= cstate->special_chars_encountered) + { + cstate->use_simd = true; + } + } +#endif + + /* + * Parse data and transfer into line_buf. To get benefit from inlining, + * call CopyReadLineText() with the constant boolean variables. 
+ */ + if (cstate->use_simd) + result = CopyReadLineText(cstate, is_csv, true); + else + result = CopyReadLineText(cstate, is_csv, false); if (result) { @@ -1241,8 +1273,8 @@ CopyReadLine(CopyFromState cstate, bool is_csv) /* * CopyReadLineText - inner loop of CopyReadLine for text mode */ -static bool -CopyReadLineText(CopyFromState cstate, bool is_csv) +static pg_attribute_always_inline bool +CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd) { char *copy_input_buf; int input_buf_ptr; @@ -1309,7 +1341,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) input_buf_ptr = cstate->input_buf_index; copy_buf_len = cstate->input_buf_len; - for (;;) + for (;; cstate->chars_processed++) { int prev_raw_ptr; char c; @@ -1346,28 +1378,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) #ifndef USE_NO_SIMD - /* - * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK. - * cstate->bytes_processed will grow an unpredictable amount with each - * call to this function, so just wait until we have crossed the - * threshold. - */ - if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK) - { - cstate->checked_simd = true; - - /* - * If we have not read too many special characters - * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up - * processing. This heuristic assumes that input does not vary too - * much from line to line and that number of special characters - * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK are - * indicitive of the whole file. - */ - if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD) - cstate->use_simd = true; - } - /* * Use SIMD instructions to efficiently scan the input buffer for * special characters (e.g., newline, carriage return, quote, and @@ -1380,7 +1390,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * sequentially. - The remaining buffer is smaller than one vector * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. 
*/ - if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + if (use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) { Vector8 chunk; Vector8 match = vector8_broadcast(0); @@ -1430,6 +1440,21 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) prev_raw_ptr = input_buf_ptr; c = copy_input_buf[input_buf_ptr++]; + /* Use this calculation to decide whether to use SIMD later */ + if (!use_simd && unlikely(!cstate->checked_simd)) + { + if (is_csv) + { + if (c == '\r' || c == '\n' || c == quotec || c == escapec) + cstate->special_chars_encountered++; + } + else + { + if (c == '\r' || c == '\n' || c == '\\') + cstate->special_chars_encountered++; + } + } + if (is_csv) { /* @@ -1440,7 +1465,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) */ if (c == '\r') { - cstate->special_chars_encountered++; IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); } @@ -1472,7 +1496,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \r */ if (c == '\r' && (!is_csv || !in_quote)) { - cstate->special_chars_encountered++; /* Check for \r\n on first line, _and_ handle \r\n. */ if (cstate->eol_type == EOL_UNKNOWN || cstate->eol_type == EOL_CRNL) @@ -1529,7 +1552,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \n */ if (c == '\n' && (!is_csv || !in_quote)) { - cstate->special_chars_encountered++; if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), @@ -1552,8 +1574,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) { char c2; - cstate->special_chars_encountered++; - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); IF_NEED_REFILL_AND_EOF_BREAK(0); -- 2.51.0