From 96dcf1ae9d73df3287d521b009b39c2728a993af Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Sun, 6 Jun 2021 11:13:09 -0400
Subject: [PATCH v11 1/2] Rewrite pg_utf8_verifystr() for speed

Instead of relying on pg_utf8_verifychar() and pg_utf8_isvalid(),
rewrite this function in a manner loosely based on the fallback that
is part of the simdjson library.

Verifying multibyte UTF-8 text is modestly faster, but the biggest
improvement is in verifying ASCII, which is now around 6 times
faster, depending on platform.
---
 src/common/wchar.c                       | 134 ++++++++++++++++++++++-
 src/test/regress/expected/conversion.out | 106 ++++++++++++++++--
 src/test/regress/sql/conversion.sql      |  66 ++++++++++-
 3 files changed, 294 insertions(+), 12 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e02..2805d01f7f 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -15,6 +15,56 @@
 #include "mb/pg_wchar.h"
 
 
+/* for UTF-8 */
+#define IS_CONTINUATION_BYTE(c)	(((c) & 0xC0) == 0x80)
+#define IS_TWO_BYTE_LEAD(c)		(((c) & 0xE0) == 0xC0)
+#define IS_THREE_BYTE_LEAD(c)	(((c) & 0xF0) == 0xE0)
+#define IS_FOUR_BYTE_LEAD(c)	(((c) & 0xF8) == 0xF0)
+
+/* Return the chunk size if s starts with a chunk of non-zero ASCII, else 0. */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		half1,
+				half2,
+				highbits_set,
+				x1,
+				x2,
+				x;
+
+	if (len >= 2 * sizeof(uint64))
+	{
+		memcpy(&half1, s, sizeof(uint64));
+		memcpy(&half2, s + sizeof(uint64), sizeof(uint64));
+
+		/* A byte with its high bit set is not ASCII, so reject the chunk. */
+		highbits_set = (half1 | half2) & UINT64CONST(0x8080808080808080);
+		if (highbits_set)
+			return 0;
+
+		/*
+		 * Check if there are any zero bytes in this chunk.
+		 *
+		 * First, add 0x7f to each byte. This sets the high bit in each byte,
+		 * unless it was a zero. We already checked that none of the bytes had
+		 * the high bit set previously, so the max value each byte can have
+		 * after the addition is 0x7f + 0x7f = 0xfe, and we don't need to
+		 * worry about carrying over to the next byte.
+		 */
+		x1 = half1 + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+		x2 = half2 + UINT64CONST(0x7f7f7f7f7f7f7f7f);
+
+		/* Then check that the high bit is set in every byte of both halves. */
+		x = (x1 & x2) & UINT64CONST(0x8080808080808080);
+		if (x != UINT64CONST(0x8080808080808080))
+			return 0;
+
+		return 2 * sizeof(uint64);
+	}
+	else
+		return 0;
+}
+
 /*
  * Operations on multi-byte encodings are driven by a table of helper
  * functions.
@@ -1761,24 +1811,102 @@ static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
+	unsigned char b1,
+				b2,
+				b3,
+				b4;
 
 	while (len > 0)
 	{
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
 				break;
 			l = 1;
 		}
-		else
+		/* code points U+0080 through U+07FF */
+		else if (IS_TWO_BYTE_LEAD(*s))
 		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
+			l = 2;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+
+			if (!IS_CONTINUATION_BYTE(b2))
+				break;
+
+			/* check 2-byte overlong: 1100.000x.10xx.xxxx */
+			if (b1 < 0xC2)
+				break;
+		}
+		/* code points U+0800 through U+D7FF and U+E000 through U+FFFF */
+		else if (IS_THREE_BYTE_LEAD(*s))
+		{
+			l = 3;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3))
+				break;
+
+			/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+			if (b1 == 0xE0 && b2 < 0xA0)
+				break;
+
+			/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+			if (b1 == 0xED && b2 > 0x9F)
 				break;
 		}
+		/* code points U+010000 through U+10FFFF */
+		else if (IS_FOUR_BYTE_LEAD(*s))
+		{
+			l = 4;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+			b4 = *(s + 3);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3) ||
+				!IS_CONTINUATION_BYTE(b4))
+				break;
+
+			/*
+			 * check 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx
+			 */
+			if (b1 == 0xF0 && b2 < 0x90)
+				break;
+
+			/* check too large: 1111.0100 1001.xxxx 10xx.xxxx 10xx.xxxx */
+			if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+				break;
+		}
+		else
+			/* invalid byte */
+			break;
+
 		s += l;
 		len -= l;
 	}
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 04fdcba496..9315ad3abd 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -72,6 +72,58 @@ $$;
 --
 -- UTF-8
 --
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text);
+insert into utf8_verification_inputs  values
+  ('\xaf',		'bare continuation'),
+  ('\xc5',		'missing second byte in 2-byte char'),
+  ('\xc080',	'smallest 2-byte overlong'),
+  ('\xc1bf',	'largest 2-byte overlong'),
+  ('\xc280',	'next 2-byte after overlongs'),
+  ('\xdfbf',	'largest 2-byte'),
+  ('\xe9af',	'missing third byte in 3-byte char'),
+  ('\xe08080',	'smallest 3-byte overlong'),
+  ('\xe09fbf',	'largest 3-byte overlong'),
+  ('\xe0a080',	'next 3-byte after overlong'),
+  ('\xed9fbf',	'last before surrogates'),
+  ('\xeda080',	'smallest surrogate'),
+  ('\xedbfbf',	'largest surrogate'),
+  ('\xee8080',	'next after surrogates'),
+  ('\xefbfbf',	'largest 3-byte'),
+  ('\xf1afbf',	'missing fourth byte in 4-byte char'),
+  ('\xf0808080',	'smallest 4-byte overlong'),
+  ('\xf08fbfbf',	'largest 4-byte overlong'),
+  ('\xf0908080',	'next 4-byte after overlong'),
+  ('\xf48fbfbf',	'largest 4-byte'),
+  ('\xf4908080',	'smallest too large'),
+  ('\xfa9a9a8a8a',	'5 byte');
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+            description             |   result   |   errorat    |                             error                              
+------------------------------------+------------+--------------+----------------------------------------------------------------
+ bare continuation                  | \x         | \xaf         | invalid byte sequence for encoding "UTF8": 0xaf
+ missing second byte in 2-byte char | \x         | \xc5         | invalid byte sequence for encoding "UTF8": 0xc5
+ smallest 2-byte overlong           | \x         | \xc080       | invalid byte sequence for encoding "UTF8": 0xc0 0x80
+ largest 2-byte overlong            | \x         | \xc1bf       | invalid byte sequence for encoding "UTF8": 0xc1 0xbf
+ next 2-byte after overlongs        | \xc280     |              | 
+ largest 2-byte                     | \xdfbf     |              | 
+ missing third byte in 3-byte char  | \x         | \xe9af       | invalid byte sequence for encoding "UTF8": 0xe9 0xaf
+ smallest 3-byte overlong           | \x         | \xe08080     | invalid byte sequence for encoding "UTF8": 0xe0 0x80 0x80
+ largest 3-byte overlong            | \x         | \xe09fbf     | invalid byte sequence for encoding "UTF8": 0xe0 0x9f 0xbf
+ next 3-byte after overlong         | \xe0a080   |              | 
+ last before surrogates             | \xed9fbf   |              | 
+ smallest surrogate                 | \x         | \xeda080     | invalid byte sequence for encoding "UTF8": 0xed 0xa0 0x80
+ largest surrogate                  | \x         | \xedbfbf     | invalid byte sequence for encoding "UTF8": 0xed 0xbf 0xbf
+ next after surrogates              | \xee8080   |              | 
+ largest 3-byte                     | \xefbfbf   |              | 
+ missing fourth byte in 4-byte char | \x         | \xf1afbf     | invalid byte sequence for encoding "UTF8": 0xf1 0xaf 0xbf
+ smallest 4-byte overlong           | \x         | \xf0808080   | invalid byte sequence for encoding "UTF8": 0xf0 0x80 0x80 0x80
+ largest 4-byte overlong            | \x         | \xf08fbfbf   | invalid byte sequence for encoding "UTF8": 0xf0 0x8f 0xbf 0xbf
+ next 4-byte after overlong         | \xf0908080 |              | 
+ largest 4-byte                     | \xf48fbfbf |              | 
+ smallest too large                 | \x         | \xf4908080   | invalid byte sequence for encoding "UTF8": 0xf4 0x90 0x80 0x80
+ 5 byte                             | \x         | \xfa9a9a8a8a | invalid byte sequence for encoding "UTF8": 0xfa
+(22 rows)
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',		'valid, pure ASCII'),
@@ -85,7 +137,7 @@ insert into utf8_inputs  values
   ('\x666f6fefa8aa',	'valid, needs mapping function to convert to GB18030'),
   ('\x66e8b1ff6f6f',	'invalid byte sequence'),
   ('\x66006f',		'invalid, NUL byte'),
-  ('\x666f6fe8b100',	'invalid, NUL byte'),
+  ('\x666f6fe8b100',	'invalid, NUL byte at end'),
   ('\x666f6fe8b1',	'incomplete character at end');
 -- Test UTF-8 verification
 select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs;
@@ -102,10 +154,48 @@ select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs;
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       |              | 
  invalid byte sequence                                | \x66                 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66                 | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6f             | \xe8b100     | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6f             | \xe8b100     | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6f             | \xe8b1       | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on 16-bytes at a time.
+-- XXX: The descriptions must be unique across the utf8_inputs and
+-- utf8_verification_inputs tables.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+  union all
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+ description | orig_error | error_after_padding 
+-------------+------------+---------------------
+(0 rows)
+
 -- Test conversions from UTF-8
 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs;
                      description                      |       inbytes        |     result     |       errorat        |                                                    error                                                    
@@ -121,7 +211,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f       | \xefa8aa             | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "EUC_JIS_2004"
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66           | \xe8b1ff6f6f         | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66           | \x006f               | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f       | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f       | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f       | \xe8b1               | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
@@ -139,7 +229,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f | \xefa8aa             | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN1"
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66     | \xe8b1ff6f6f         | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66     | \x006f               | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f | \xe8b1               | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
@@ -157,7 +247,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin2')).* from utf8_
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f | \xefa8aa             | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN2"
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66     | \xe8b1ff6f6f         | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66     | \x006f               | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f | \xe8b1               | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
@@ -175,7 +265,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'latin5')).* from utf8_
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f | \xefa8aa             | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN5"
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66     | \xe8b1ff6f6f         | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66     | \x006f               | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f | \xe8b1               | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
@@ -193,7 +283,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'koi8r')).* from utf8_i
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f | \xefa8aa             | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "KOI8R"
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66     | \xe8b1ff6f6f         | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66     | \x006f               | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f | \xe8b100             | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f | \xe8b1               | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
@@ -211,7 +301,7 @@ select description, inbytes, (test_conv(inbytes, 'utf8', 'gb18030')).* from utf8
  valid, needs mapping function to convert to GB18030  | \x666f6fefa8aa       | \x666f6f84309c38           |              | 
  invalid byte sequence                                | \x66e8b1ff6f6f       | \x66                       | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff
  invalid, NUL byte                                    | \x66006f             | \x66                       | \x006f       | invalid byte sequence for encoding "UTF8": 0x00
- invalid, NUL byte                                    | \x666f6fe8b100       | \x666f6f                   | \xe8b100     | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
+ invalid, NUL byte at end                             | \x666f6fe8b100       | \x666f6f                   | \xe8b100     | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00
  incomplete character at end                          | \x666f6fe8b1         | \x666f6f                   | \xe8b1       | invalid byte sequence for encoding "UTF8": 0xe8 0xb1
 (13 rows)
 
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index 8358682432..8ad5290f4c 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -74,6 +74,34 @@ $$;
 --
 -- UTF-8
 --
+CREATE TABLE utf8_verification_inputs (inbytes bytea, description text);
+insert into utf8_verification_inputs  values
+  ('\xaf',		'bare continuation'),
+  ('\xc5',		'missing second byte in 2-byte char'),
+  ('\xc080',	'smallest 2-byte overlong'),
+  ('\xc1bf',	'largest 2-byte overlong'),
+  ('\xc280',	'next 2-byte after overlongs'),
+  ('\xdfbf',	'largest 2-byte'),
+  ('\xe9af',	'missing third byte in 3-byte char'),
+  ('\xe08080',	'smallest 3-byte overlong'),
+  ('\xe09fbf',	'largest 3-byte overlong'),
+  ('\xe0a080',	'next 3-byte after overlong'),
+  ('\xed9fbf',	'last before surrogates'),
+  ('\xeda080',	'smallest surrogate'),
+  ('\xedbfbf',	'largest surrogate'),
+  ('\xee8080',	'next after surrogates'),
+  ('\xefbfbf',	'largest 3-byte'),
+  ('\xf1afbf',	'missing fourth byte in 4-byte char'),
+  ('\xf0808080',	'smallest 4-byte overlong'),
+  ('\xf08fbfbf',	'largest 4-byte overlong'),
+  ('\xf0908080',	'next 4-byte after overlong'),
+  ('\xf48fbfbf',	'largest 4-byte'),
+  ('\xf4908080',	'smallest too large'),
+  ('\xfa9a9a8a8a',	'5 byte');
+
+-- Test UTF-8 verification
+select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_verification_inputs;
+
 CREATE TABLE utf8_inputs (inbytes bytea, description text);
 insert into utf8_inputs  values
   ('\x666f6f',		'valid, pure ASCII'),
@@ -87,11 +115,47 @@ insert into utf8_inputs  values
   ('\x666f6fefa8aa',	'valid, needs mapping function to convert to GB18030'),
   ('\x66e8b1ff6f6f',	'invalid byte sequence'),
   ('\x66006f',		'invalid, NUL byte'),
-  ('\x666f6fe8b100',	'invalid, NUL byte'),
+  ('\x666f6fe8b100',	'invalid, NUL byte at end'),
   ('\x666f6fe8b1',	'incomplete character at end');
 
 -- Test UTF-8 verification
 select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs;
+
+-- Test UTF-8 verification with ASCII padding appended to provide
+-- coverage for algorithms that work on 16-bytes at a time.
+-- XXX: The descriptions must be unique across the utf8_inputs and
+-- utf8_verification_inputs tables.
+with test_bytes as (
+  -- The error message for a sequence starting with a 4-byte lead
+  -- will contain all 4 bytes if they are present, so add 3
+  -- ASCII bytes to the end to ensure consistent error messages.
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_verification_inputs
+  union all
+  select
+    inbytes,
+    description,
+    (test_conv(inbytes || repeat('.', 3)::bytea, 'utf8', 'utf8')).error
+  from utf8_inputs
+), test_padded as (
+  select
+    description,
+    (test_conv(inbytes || repeat('.', 16)::bytea, 'utf8', 'utf8')).error
+  from test_bytes
+)
+select
+  description,
+  b.error as orig_error,
+  p.error as error_after_padding
+from test_padded p
+join test_bytes b
+using (description)
+where p.error is distinct from b.error
+order by description;
+
 -- Test conversions from UTF-8
 select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs;
 select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_inputs;
-- 
2.31.1