From d6268d03c3d8cbd340ba4a8fbf588a94b82d4ad6 Mon Sep 17 00:00:00 2001 From: Mark Dilger Date: Sat, 21 Mar 2026 20:11:26 -0700 Subject: [PATCH v3] Demonstrate inconsistency in gin index vs seq scan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After 0001, ILIKE uses str_casefold() for matching, but pg_trgm still uses str_tolower() for trigram extraction (trgm_op.c:352 and :948). With builtin collations, these produce different results: str_casefold('ς') = 'σ' (Greek final sigma folds to regular sigma) str_tolower('ς') = 'ς' (already lowercase, no change) The GIN consistent function (trgm_gin.c:228-236) requires ALL pattern trigrams to be present in the index. When the data contains 'ς' but the pattern contains 'σ' (or vice versa), the trigrams extracted by str_tolower won't match, so the GIN filter rejects the row before recheck runs. Sequential scan finds it; index scan does not. --- contrib/pg_trgm/Makefile | 2 +- .../expected/pg_trgm_ilike_false_negative.out | 135 ++++++++++++++++++ contrib/pg_trgm/meson.build | 1 + .../sql/pg_trgm_ilike_false_negative.sql | 118 +++++++++++++++ 4 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 contrib/pg_trgm/expected/pg_trgm_ilike_false_negative.out create mode 100644 contrib/pg_trgm/sql/pg_trgm_ilike_false_negative.sql diff --git a/contrib/pg_trgm/Makefile b/contrib/pg_trgm/Makefile index c1756993ec7..b0e68228f00 100644 --- a/contrib/pg_trgm/Makefile +++ b/contrib/pg_trgm/Makefile @@ -14,7 +14,7 @@ DATA = pg_trgm--1.5--1.6.sql pg_trgm--1.4--1.5.sql pg_trgm--1.3--1.4.sql \ pg_trgm--1.0--1.1.sql PGFILEDESC = "pg_trgm - trigram matching" -REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm +REGRESS = pg_trgm pg_utf8_trgm pg_word_trgm pg_strict_word_trgm pg_trgm_ilike_false_negative ifdef USE_PGXS PG_CONFIG = pg_config diff --git a/contrib/pg_trgm/expected/pg_trgm_ilike_false_negative.out b/contrib/pg_trgm/expected/pg_trgm_ilike_false_negative.out new file mode 100644 index 00000000000..4cd2b659a89 --- /dev/null +++ b/contrib/pg_trgm/expected/pg_trgm_ilike_false_negative.out @@ -0,0 +1,135 @@ +-- +-- Demonstration that pg_trgm GIN indexes can produce false negatives +-- with ILIKE after ILIKE switches from str_tolower to str_casefold. +-- +-- pg_trgm extracts trigrams using str_tolower with DEFAULT_COLLATION_OID +-- (trgm_op.c:352, :948). After the patch, ILIKE matches using +-- str_casefold with DEFAULT_COLLATION_OID. For builtin collations, +-- casefold('ς') = 'σ' but tolower('ς') = 'ς'. The GIN consistent +-- function requires ALL pattern trigrams to be present, so any +-- mismatched trigram causes the row to be rejected before recheck. +-- +-- This requires a database whose default collation uses the builtin +-- provider. We create one within the test. +-- +-- skip if not UTF8 +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif +CREATE DATABASE test_trgm_builtin + LOCALE_PROVIDER=builtin + BUILTIN_LOCALE='C.UTF-8' + ENCODING='UTF8' + TEMPLATE=template0; +\c test_trgm_builtin +CREATE EXTENSION pg_trgm; +SET client_encoding TO UTF8; +-- ============================================================ +-- Setup: table with Greek sigma variants plus filler rows +-- to ensure the planner prefers the GIN index. +-- ============================================================ +CREATE TABLE test_trgm_sigma (t text); +INSERT INTO test_trgm_sigma VALUES ('abcςdefgh'); -- final sigma U+03C2 +INSERT INTO test_trgm_sigma VALUES ('abcσdefgh'); -- regular sigma U+03C3 +INSERT INTO test_trgm_sigma + SELECT 'row' || g || repeat('x', 20) + FROM generate_series(1, 1000) g; +CREATE INDEX test_trgm_sigma_gin ON test_trgm_sigma USING gin (t gin_trgm_ops); +ANALYZE test_trgm_sigma; +-- ============================================================ +-- Verify the planner uses the GIN index (no COLLATE override, +-- so the default builtin collation is used throughout). +-- ============================================================ +SET enable_seqscan = off; +EXPLAIN (COSTS OFF) +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%'; + QUERY PLAN +------------------------------------------------- + Bitmap Heap Scan on test_trgm_sigma + Recheck Cond: (t ~~* '%abcσdefgh%'::text) + -> Bitmap Index Scan on test_trgm_sigma_gin + Index Cond: (t ~~* '%abcσdefgh%'::text) +(4 rows) + +-- ============================================================ +-- Test 1: Sequential scan. +-- +-- str_casefold with the builtin default collation maps ς→σ, +-- so 'abcςdefgh' matches pattern '%abcσdefgh%'. +-- Both sigma rows should be returned. +-- ============================================================ +SET enable_seqscan = on; +SET enable_indexscan = off; +SET enable_bitmapscan = off; +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%' ORDER BY t; + t +----------- + abcςdefgh + abcσdefgh +(2 rows) + +-- ============================================================ +-- Test 2: GIN index scan. +-- +-- pg_trgm's trigrams for data 'abcςdefgh' were extracted via +-- str_tolower(DEFAULT_COLLATION_OID). With the builtin collation, +-- tolower('ς') = 'ς', producing trigrams 'bcς','cςd','ςde'. +-- The pattern trigrams are extracted via str_tolower('abcσdefgh'), +-- producing 'bcσ','cσd','σde'. These don't match the ς-based +-- trigrams in the index, so the GIN filter rejects the ς row. +-- +-- If this returns fewer rows than Test 1, that's a false negative: +-- the index scan missed a row the operator would have matched. +-- ============================================================ +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = on; +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%' ORDER BY t; + t +----------- + abcσdefgh +(1 row) + +-- ============================================================ +-- Test 3: Reverse — final sigma (ς) in pattern. +-- Same issue, opposite direction. +-- ============================================================ +SET enable_seqscan = on; +SET enable_indexscan = off; +SET enable_bitmapscan = off; +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcςdefgh%' ORDER BY t; + t +----------- + abcςdefgh + abcσdefgh +(2 rows) + +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = on; +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcςdefgh%' ORDER BY t; + t +----------- + abcςdefgh +(1 row) + +-- ============================================================ +-- Sanity check: ASCII ILIKE with GIN works correctly. +-- ============================================================ +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%DEFGH%' ORDER BY t; + t +----------- + abcςdefgh + abcσdefgh +(2 rows) + +-- ============================================================ +-- Cleanup +-- ============================================================ +RESET enable_seqscan; +RESET enable_indexscan; +RESET enable_bitmapscan; +DROP TABLE test_trgm_sigma; +\c contrib_regression +DROP DATABASE test_trgm_builtin; diff --git a/contrib/pg_trgm/meson.build b/contrib/pg_trgm/meson.build index 3ecf95ba862..808fa5bd0ad 100644 --- a/contrib/pg_trgm/meson.build +++ b/contrib/pg_trgm/meson.build @@ -42,6 +42,7 @@ tests += { 'pg_utf8_trgm', 'pg_word_trgm', 'pg_strict_word_trgm', + 'pg_trgm_ilike_false_negative', ], }, } diff --git a/contrib/pg_trgm/sql/pg_trgm_ilike_false_negative.sql b/contrib/pg_trgm/sql/pg_trgm_ilike_false_negative.sql new file mode 100644 index 00000000000..a0bf22c0dd4 --- /dev/null +++ b/contrib/pg_trgm/sql/pg_trgm_ilike_false_negative.sql @@ -0,0 +1,118 @@ +-- +-- Demonstration that pg_trgm GIN indexes can produce false negatives +-- with ILIKE after ILIKE switches from str_tolower to str_casefold. +-- +-- pg_trgm extracts trigrams using str_tolower with DEFAULT_COLLATION_OID +-- (trgm_op.c:352, :948). After the patch, ILIKE matches using +-- str_casefold with DEFAULT_COLLATION_OID. For builtin collations, +-- casefold('ς') = 'σ' but tolower('ς') = 'ς'. The GIN consistent +-- function requires ALL pattern trigrams to be present, so any +-- mismatched trigram causes the row to be rejected before recheck. +-- +-- This requires a database whose default collation uses the builtin +-- provider. We create one within the test. +-- + +-- skip if not UTF8 +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset +\if :skip_test +\quit +\endif + +CREATE DATABASE test_trgm_builtin + LOCALE_PROVIDER=builtin + BUILTIN_LOCALE='C.UTF-8' + ENCODING='UTF8' + TEMPLATE=template0; + +\c test_trgm_builtin + +CREATE EXTENSION pg_trgm; + +SET client_encoding TO UTF8; + +-- ============================================================ +-- Setup: table with Greek sigma variants plus filler rows +-- to ensure the planner prefers the GIN index. +-- ============================================================ +CREATE TABLE test_trgm_sigma (t text); +INSERT INTO test_trgm_sigma VALUES ('abcςdefgh'); -- final sigma U+03C2 +INSERT INTO test_trgm_sigma VALUES ('abcσdefgh'); -- regular sigma U+03C3 +INSERT INTO test_trgm_sigma + SELECT 'row' || g || repeat('x', 20) + FROM generate_series(1, 1000) g; +CREATE INDEX test_trgm_sigma_gin ON test_trgm_sigma USING gin (t gin_trgm_ops); +ANALYZE test_trgm_sigma; + +-- ============================================================ +-- Verify the planner uses the GIN index (no COLLATE override, +-- so the default builtin collation is used throughout). +-- ============================================================ +SET enable_seqscan = off; + +EXPLAIN (COSTS OFF) +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%'; + +-- ============================================================ +-- Test 1: Sequential scan. +-- +-- str_casefold with the builtin default collation maps ς→σ, +-- so 'abcςdefgh' matches pattern '%abcσdefgh%'. +-- Both sigma rows should be returned. +-- ============================================================ +SET enable_seqscan = on; +SET enable_indexscan = off; +SET enable_bitmapscan = off; + +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%' ORDER BY t; + +-- ============================================================ +-- Test 2: GIN index scan. +-- +-- pg_trgm's trigrams for data 'abcςdefgh' were extracted via +-- str_tolower(DEFAULT_COLLATION_OID). With the builtin collation, +-- tolower('ς') = 'ς', producing trigrams 'bcς','cςd','ςde'. +-- The pattern trigrams are extracted via str_tolower('abcσdefgh'), +-- producing 'bcσ','cσd','σde'. These don't match the ς-based +-- trigrams in the index, so the GIN filter rejects the ς row. +-- +-- If this returns fewer rows than Test 1, that's a false negative: +-- the index scan missed a row the operator would have matched. +-- ============================================================ +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = on; + +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcσdefgh%' ORDER BY t; + +-- ============================================================ +-- Test 3: Reverse — final sigma (ς) in pattern. +-- Same issue, opposite direction. +-- ============================================================ +SET enable_seqscan = on; +SET enable_indexscan = off; +SET enable_bitmapscan = off; + +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcςdefgh%' ORDER BY t; + +SET enable_seqscan = off; +SET enable_indexscan = on; +SET enable_bitmapscan = on; + +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%abcςdefgh%' ORDER BY t; + +-- ============================================================ +-- Sanity check: ASCII ILIKE with GIN works correctly. +-- ============================================================ +SELECT t FROM test_trgm_sigma WHERE t ILIKE '%DEFGH%' ORDER BY t; + +-- ============================================================ +-- Cleanup +-- ============================================================ +RESET enable_seqscan; +RESET enable_indexscan; +RESET enable_bitmapscan; +DROP TABLE test_trgm_sigma; + +\c contrib_regression +DROP DATABASE test_trgm_builtin; -- 2.39.5 (Apple Git-154)