From 9f03e4d23356e8012e8a190bddd81059dc88471b Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 14 Feb 2026 21:07:38 +1300 Subject: [PATCH v2tm 1/2] Fix SUBSTRING() for toasted multibyte characters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1e7fe06c10c0a8da9dd6261a6be8d405dc17c728 changed pg_mbstrlen_with_len() to ereport(ERROR) if the input ends in an incomplete character. Most callers want that. text_substring() did not. It detoasts the most bytes it could possibly need to get the requested number of characters. For example, to extract up to 2 chars from UTF8, it needs to detoast 8 bytes. In a string of 3-byte UTF8 chars, 8 bytes spans 2 complete chars and 1 partial char. Fix by removing the pg_mbstrlen_with_len() call. An encoding error is still raised if input runs out in the middle of a multi-byte sequence that is needed for the result, but that can be detected incrementally. This is consistent with the general philosophy of the above commit, which was to raise errors on a just-in-time basis. Before the above commit, SUBSTRING() never raised an encoding error. SUBSTRING() has long been detoasting enough for one more char than needed, because it did not distinguish exclusive and inclusive end position. For avoidance of doubt, stop detoasting extra. A long standing overflow bug affecting positions exceeding 2^29 is also addressed, though such large numbers would be unlikely to be used in real applications. Back-patch to v14, like the above commit. For applications using SUBSTRING() on non-ASCII column values, consider applying this to your copy of any of the February 12, 2026 releases. Reported-by: SATŌ Kentarō Reviewed-by: FIXME Bug: #19406 Discussion: https://postgr.es/m/19406-9867fddddd724fca@postgresql.org Backpatch-through: 14 --- src/backend/utils/adt/varlena.c | 103 +++++++------------------ src/test/regress/expected/encoding.out | 72 ++++++++++++----- src/test/regress/sql/encoding.sql | 26 +++++-- 3 files changed, 102 insertions(+), 99 deletions(-) diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index dbecd7160d6..5e3951ab0e1 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -586,7 +586,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) int32 S = start; /* start position */ int32 S1; /* adjusted start position */ int32 L1; /* adjusted substring length */ - int32 E; /* end position */ + int32 E; /* end position, exclusive */ /* * SQL99 says S can be zero or negative (which we don't document), but we @@ -644,34 +644,27 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * detoasting, so we'll grab a conservatively large slice now and go * back later to do the right thing */ - int32 slice_start; int32 slice_size; - int32 slice_strlen; - int32 slice_len; + const char *slice_end; text *slice; - int32 E1; int32 i; char *p; char *s; text *ret; - /* - * We need to start at position zero because there is no way to know - * in advance which byte offset corresponds to the supplied start - * position. - */ - slice_start = 0; - - if (length_not_specified) /* special case - get length to end of - * string */ - slice_size = L1 = -1; + if (length_not_specified) /* special case - get whole string */ + { + slice_size = -1; + E = PG_INT32_MAX; + } else if (length < 0) { /* SQL99 says to throw an error for E < S, i.e., negative length */ ereport(ERROR, (errcode(ERRCODE_SUBSTRING_ERROR), errmsg("negative substring length not allowed"))); - slice_size = L1 = -1; /* silence stupider compilers */ + slice_size = -1; /* silence stupider compilers */ + E = PG_INT32_MAX; } else if (pg_add_s32_overflow(S, length, &E)) { @@ -679,30 +672,17 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) * L could be large enough for S + L to overflow, in which case * the substring must run to end of string. */ - slice_size = L1 = -1; + slice_size = -1; + E = PG_INT32_MAX; } else { /* - * A zero or negative value for the end position can happen if the - * start was negative or one. SQL99 says to return a zero-length - * string. - */ - if (E < 1) - return cstring_to_text(""); - - /* - * if E is past the end of the string, the tuple toaster will - * truncate the length for us + * Total slice size in bytes can't be any longer than the end + * position (made zero-based) times the encoding max length. If + * that overflows, we can just use -1. */ - L1 = E - S1; - - /* - * Total slice size in bytes can't be any longer than the start - * position plus substring length times the encoding max length. - * If that overflows, we can just use -1. - */ - if (pg_mul_s32_overflow(E, eml, &slice_size)) + if (pg_mul_s32_overflow(E - 1, eml, &slice_size)) slice_size = -1; } @@ -712,63 +692,34 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified) */ if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || VARATT_IS_EXTERNAL(DatumGetPointer(str))) - slice = DatumGetTextPSlice(str, slice_start, slice_size); + slice = DatumGetTextPSlice(str, 0, slice_size); else slice = (text *) DatumGetPointer(str); - /* see if we got back an empty string */ - slice_len = VARSIZE_ANY_EXHDR(slice); - if (slice_len == 0) - { - if (slice != (text *) DatumGetPointer(str)) - pfree(slice); - return cstring_to_text(""); - } - - /* Now we can get the actual length of the slice in MB characters */ - slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice), - slice_len); - - /* - * Check that the start position wasn't > slice_strlen. If so, SQL99 - * says to return a zero-length string. - */ - if (S1 > slice_strlen) - { - if (slice != (text *) DatumGetPointer(str)) - pfree(slice); - return cstring_to_text(""); - } - - /* - * Adjust L1 and E1 now that we know the slice string length. Again - * remember that S1 is one based, and slice_start is zero based. - */ - if (L1 > -1) - E1 = Min(S1 + L1, slice_start + 1 + slice_strlen); - else - E1 = slice_start + 1 + slice_strlen; + p = VARDATA_ANY(slice); + slice_end = p + VARSIZE_ANY_EXHDR(slice); /* - * Find the start position in the slice; remember S1 is not zero based + * Find the start position in the slice; remember S1 is not zero + * based. If we run out of input first, we'll return an empty string. */ - p = VARDATA_ANY(slice); - for (i = 0; i < S1 - 1; i++) - p += pg_mblen_unbounded(p); + for (i = 1; i < S1 && p < slice_end; i++) + p += pg_mblen_range(p, slice_end); /* hang onto a pointer to our start position */ s = p; /* * Count the actual bytes used by the substring of the requested - * length. + * length, again giving up if we run out of input. */ - for (i = S1; i < E1; i++) - p += pg_mblen_unbounded(p); + for (i = S1; i < E && p < slice_end; i++) + p += pg_mblen_range(p, slice_end); ret = (text *) palloc(VARHDRSZ + (p - s)); SET_VARSIZE(ret, VARHDRSZ + (p - s)); - memcpy(VARDATA(ret), s, (p - s)); + if (s < p) + memcpy(VARDATA(ret), s, (p - s)); if (slice != (text *) DatumGetPointer(str)) pfree(slice); diff --git a/src/test/regress/expected/encoding.out b/src/test/regress/expected/encoding.out index ea1f38cff41..eec879b05ec 100644 --- a/src/test/regress/expected/encoding.out +++ b/src/test/regress/expected/encoding.out @@ -63,7 +63,13 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; + substring +----------- + caf +(1 row) + +SELECT substring(truncated, 1, 4) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 SELECT reverse(truncated) FROM regress_encoding; ERROR: invalid byte sequence for encoding "UTF8": 0xc3 @@ -84,6 +90,13 @@ SELECT length(with_nul) FROM regress_encoding; 4 (1 row) +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; + regexp_replace +---------------- + é +(1 row) + +-- NUL = character SELECT substring(with_nul, 3, 1) FROM regress_encoding; substring ----------- @@ -96,25 +109,12 @@ SELECT substring(with_nul, 4, 1) FROM regress_encoding; é (1 row) -SELECT substring(with_nul, 5, 1) FROM regress_encoding; - substring ------------ - -(1 row) - -SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; - convert_to ------------- - \x -(1 row) - -SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; - regexp_replace ----------------- - é +SELECT substring(with_nul, 5, 1) = test_bytea_to_text('\x00') FROM regress_encoding; + ?column? +---------- + t (1 row) --- NUL = character SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; with_nul | reverse | reverse ----------+---------+--------- @@ -375,7 +375,43 @@ NOTICE: MULE_INTERNAL LC2: \x908283 -> {9470595} -> \x908283 = OK t (1 row) +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char (bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; + substring +----------- + +(1 row) + +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; + substring +----------- + … +(1 row) + +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +ERROR: invalid byte sequence for encoding "UTF8": 0xe2 0x80 +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + substring +----------- + 🚀 +(1 row) + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func; diff --git a/src/test/regress/sql/encoding.sql b/src/test/regress/sql/encoding.sql index b9543c0cb32..7de95f49947 100644 --- a/src/test/regress/sql/encoding.sql +++ b/src/test/regress/sql/encoding.sql @@ -40,7 +40,8 @@ SELECT reverse(good) FROM regress_encoding; -- invalid short mb character = error SELECT length(truncated) FROM regress_encoding; -SELECT substring(truncated, 1, 1) FROM regress_encoding; +SELECT substring(truncated, 1, 3) FROM regress_encoding; +SELECT substring(truncated, 1, 4) FROM regress_encoding; SELECT reverse(truncated) FROM regress_encoding; -- invalid short mb character = silently dropped SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; @@ -51,12 +52,11 @@ SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; -- NUL = terminator SELECT length(with_nul) FROM regress_encoding; -SELECT substring(with_nul, 3, 1) FROM regress_encoding; -SELECT substring(with_nul, 4, 1) FROM regress_encoding; -SELECT substring(with_nul, 5, 1) FROM regress_encoding; -SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; -- NUL = character +SELECT substring(with_nul, 3, 1) FROM regress_encoding; +SELECT substring(with_nul, 4, 1) FROM regress_encoding; +SELECT substring(with_nul, 5, 1) = test_bytea_to_text('\x00') FROM regress_encoding; SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; -- If a corrupted string contains NUL in the tail bytes of a multibyte @@ -212,7 +212,23 @@ INSERT INTO encoding_tests VALUES SELECT COUNT(test_encoding(encoding, description, input)) > 0 FROM encoding_tests; +-- substring fetches a slice of a toasted value; unused tail of that slice is +-- an incomplete char (bug #19406) +CREATE TABLE toast_3b_utf8 (c text); +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- diagnose incomplete char iff within the substring +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; +-- substring needing last byte of its slice_size +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; + DROP TABLE encoding_tests; +DROP TABLE toast_4b_utf8; DROP FUNCTION test_encoding; DROP FUNCTION test_text_to_wchars; DROP FUNCTION test_mblen_func; -- 2.47.3