From 58a913589b0b89a8c5ece50b5f8de6c9321a8366 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Thu, 23 May 2024 10:53:23 +1200 Subject: [PATCH v5] Optimize escaping of JSON strings using SIMD Here we adjust escape_json_with_len() to make use of SIMD to allow processing of up to 16-bytes at a time rather than processing a single byte at a time. This has been shown to speed up escaping of JSON strings significantly, especially when no escaping is required. Reviewed-by: Melih Mutlu Discussion: https://postgr.es/m/CAApHDvpLXwMZvbCKcdGfU9XQjGCDm7tFpRdTXuB9PVgpNUYfEQ@mail.gmail.com --- src/backend/utils/adt/json.c | 82 +++++++++++++++++++++++++++++- src/test/regress/expected/json.out | 48 +++++++++++++++++ src/test/regress/sql/json.sql | 7 +++ 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index be7bc46038..4e86d734e4 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -19,6 +19,7 @@ #include "funcapi.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "port/simd.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/date.h" @@ -1603,11 +1604,88 @@ escape_json(StringInfo buf, const char *str) void escape_json_with_len(StringInfo buf, const char *str, int len) { + int i = 0; + int copypos = 0; + int vlen; + + Assert(len >= 0); + + /* + * Figure out how many bytes to process using SIMD. Round 'len' down to + * the previous multiple of sizeof(Vector8), assuming that's a power-of-2. + */ + vlen = len & (int) (~(sizeof(Vector8) - 1)); + appendStringInfoCharMacro(buf, '"'); - for (int i = 0; i < len; i++) - escape_json_char(buf, str[i]); + for (;;) + { + /* + * To speed this up try searching sizeof(Vector8) bytes at once for + * special characters that we need to escape. When we find one, we + * fall out of the Vector8 loop and copy the portion we've vector + * searched and then we process sizeof(Vector8) bytes one byte at a + * time. 
Once done, come back and try doing vector searching again. + * We'll also process any remaining bytes at the tail end of the + * string byte-by-byte. This optimization assumes special characters + * are not that common. + */ + for (; i < vlen; i += sizeof(Vector8)) + { + Vector8 chunk; + + vector8_load(&chunk, (const uint8 *) &str[i]); + + /* + * Break on anything less than ' ' or if we find a '"' or '\\'. + * Those need special handling. That's done in the per-byte loop. + */ + if (vector8_has_le(chunk, (unsigned char) 0x1F) || + vector8_has(chunk, (unsigned char) '"') || + vector8_has(chunk, (unsigned char) '\\')) + break; + +/* #define ESCAPE_JSON_MAX_LOOKHEAD 512 */ +#ifdef ESCAPE_JSON_MAX_LOOKHEAD + if (i - copypos >= ESCAPE_JSON_MAX_LOOKHEAD) + { + appendBinaryStringInfo(buf, &str[copypos], i - copypos); + copypos = i; + } +#endif + } + + /* + * Write to the destination up to the point that we've vector + * searched so far. Do this only when switching into per-byte mode + * rather than once every sizeof(Vector8) bytes. + */ + if (copypos < i) + { + appendBinaryStringInfo(buf, &str[copypos], i - copypos); + copypos = i; + } + + /* + * Per-byte loop for Vector8s containing special chars and for + * processing the tail of the string. + */ + for (int b = 0; b < sizeof(Vector8); b++) + { + /* check if we've finished */ + if (i == len) + goto done; + + Assert(i < len); + + escape_json_char(buf, str[i++]); + } + + copypos = i; + /* We're not done yet. Try the vector search again */ + } +done: appendStringInfoCharMacro(buf, '"'); } diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out index aa29bc597b..c8e9b97f0a 100644 --- a/src/test/regress/expected/json.out +++ b/src/test/regress/expected/json.out @@ -55,6 +55,54 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes "............abc\n" (1 row) +-- Test various lengths of strings to validate SIMD processing to escape +-- special chars in the JSON. 
+SELECT row_to_json(j)::jsonb FROM ( + SELECT left(E'abcdefghijklmnopqrstuvwxyz0123456"\t78', a) AS very_long_column_name_to_test_json_escape + FROM generate_series(0,37) a +) j; + row_to_json +------------------------------------------------------------------------------------------ + {"very_long_column_name_to_test_json_escape": ""} + {"very_long_column_name_to_test_json_escape": "a"} + {"very_long_column_name_to_test_json_escape": "ab"} + {"very_long_column_name_to_test_json_escape": "abc"} + {"very_long_column_name_to_test_json_escape": "abcd"} + {"very_long_column_name_to_test_json_escape": "abcde"} + {"very_long_column_name_to_test_json_escape": "abcdef"} + {"very_long_column_name_to_test_json_escape": "abcdefg"} + {"very_long_column_name_to_test_json_escape": "abcdefgh"} + {"very_long_column_name_to_test_json_escape": "abcdefghi"} + {"very_long_column_name_to_test_json_escape": "abcdefghij"} + {"very_long_column_name_to_test_json_escape": "abcdefghijk"} + {"very_long_column_name_to_test_json_escape": "abcdefghijkl"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklm"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmn"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmno"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnop"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopq"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqr"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrs"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrst"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstu"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuv"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvw"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwx"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxy"} + 
{"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz01"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz012"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz01234"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz012345"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\""} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t7"} + {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t78"} +(38 rows) + -- see json_encoding test for input with unicode escapes -- Numbers. SELECT '1'::json; -- OK diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql index ec57dfe707..9bf33115d4 100644 --- a/src/test/regress/sql/json.sql +++ b/src/test/regress/sql/json.sql @@ -12,6 +12,13 @@ SELECT '"\v"'::json; -- ERROR, not a valid JSON escape SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes +-- Test various lengths of strings to validate SIMD processing to escape +-- special chars in the JSON. +SELECT row_to_json(j)::jsonb FROM ( + SELECT left(E'abcdefghijklmnopqrstuvwxyz0123456"\t78', a) AS very_long_column_name_to_test_json_escape + FROM generate_series(0,37) a +) j; + -- see json_encoding test for input with unicode escapes -- Numbers. -- 2.34.1