From 846f4a434a1cc7a72a5beb88326f9c03c9d599f1 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 20 Oct 2022 19:55:23 +0200 Subject: [PATCH 2/3] Fix handling of NULLs when building BRIN summaries The existing code initializes all_nulls = true for new summaries, but this poses issue when adding the first non-NULL value. We have to reset all_nulls to false, but we don't know whether to modify has_nulls. If there was a NULL value before, we need to set has_nulls=true. But if the range was empty and we're adding the first value, we must not set has_nulls. The current code resets all_nulls=false, without setting has_nulls, but that means we may forget the range contains NULL values. So this is a index corruption, producing incorrect results to IS NULL conditions. We might always set has_nulls=true whenever resetting all_nulls, which would resolve the index corruption, but it'd mean all ranges have either all_nulls or has_nulls set, making the index useless for IS [NOT] NULL queries. Ideally, we'd add a new flag to identify empty summaries, but that does not really work for backpatching - we'd need to add the flag to some struct, e.g. BrinValues, but that'd change stride of the bt_columns array, i.e. an ABI break. So instead we use an "impossible" combination with both all_nulls and has_nulls set to true to identify this case. And we never store index tuples with this combination. Note: This may be an issue because we won't store summaries for empty ranges, making them match any condition. So far we had the same issue for IS NULL conditions only. --- .../expected/pg_freespacemap.out | 11 ++- .../pg_freespacemap/sql/pg_freespacemap.sql | 8 +- src/backend/access/brin/brin.c | 88 +++++++++++++++++-- src/backend/access/brin/brin_tuple.c | 19 +++- ...summarization-and-inprogress-insertion.out | 20 +---- ...ummarization-and-inprogress-insertion.spec | 4 +- src/test/modules/brin/t/01_workitems.pl | 11 +-- src/test/regress/expected/brin.out | 2 +- src/test/regress/expected/brin_bloom.out | 2 +- src/test/regress/expected/brin_multi.out | 2 +- 10 files changed, 126 insertions(+), 41 deletions(-) diff --git a/contrib/pg_freespacemap/expected/pg_freespacemap.out b/contrib/pg_freespacemap/expected/pg_freespacemap.out index eb574c23736..fa0d78c88a4 100644 --- a/contrib/pg_freespacemap/expected/pg_freespacemap.out +++ b/contrib/pg_freespacemap/expected/pg_freespacemap.out @@ -1,8 +1,11 @@ CREATE EXTENSION pg_freespacemap; -CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off); -CREATE INDEX freespace_brin ON freespace_tab USING brin (c1); +CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off, fillfactor = 10); +CREATE INDEX freespace_brin ON freespace_tab USING brin (c1) WITH (pages_per_range=1); CREATE INDEX freespace_btree ON freespace_tab USING btree (c1); CREATE INDEX freespace_hash ON freespace_tab USING hash (c1); +-- necessary to build the first BRIN index tuple +INSERT INTO freespace_tab VALUES (1); +VACUUM; -- report all the sizes of the FSMs for all the relation blocks. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail @@ -10,10 +13,12 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace ORDER BY 1, 2; id | blkno | is_avail -----------------+-------+---------- + freespace_tab | 0 | t freespace_brin | 0 | f freespace_brin | 1 | f freespace_brin | 2 | t freespace_btree | 0 | f + freespace_btree | 1 | f freespace_hash | 0 | f freespace_hash | 1 | f freespace_hash | 2 | f @@ -24,7 +29,7 @@ WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace freespace_hash | 7 | f freespace_hash | 8 | f freespace_hash | 9 | f -(14 rows) +(16 rows) INSERT INTO freespace_tab VALUES (1); VACUUM freespace_tab; diff --git a/contrib/pg_freespacemap/sql/pg_freespacemap.sql b/contrib/pg_freespacemap/sql/pg_freespacemap.sql index 06275d8fac8..efc0699aa6f 100644 --- a/contrib/pg_freespacemap/sql/pg_freespacemap.sql +++ b/contrib/pg_freespacemap/sql/pg_freespacemap.sql @@ -1,10 +1,14 @@ CREATE EXTENSION pg_freespacemap; -CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off); -CREATE INDEX freespace_brin ON freespace_tab USING brin (c1); +CREATE TABLE freespace_tab (c1 int) WITH (autovacuum_enabled = off, fillfactor = 10); +CREATE INDEX freespace_brin ON freespace_tab USING brin (c1) WITH (pages_per_range=1); CREATE INDEX freespace_btree ON freespace_tab USING btree (c1); CREATE INDEX freespace_hash ON freespace_tab USING hash (c1); +-- necessary to build the first BRIN index tuple +INSERT INTO freespace_tab VALUES (1); +VACUUM; + -- report all the sizes of the FSMs for all the relation blocks. WITH rel AS (SELECT oid::regclass AS id FROM pg_class WHERE relname ~ 'freespace') SELECT rel.id, fsm.blkno, (fsm.avail > 0) AS is_avail diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 7e386250ae9..3ed8eefab86 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1568,15 +1568,48 @@ form_and_insert_tuple(BrinBuildState *state) { BrinTuple *tup; Size size; + bool modified = false; + BrinMemTuple *dtuple = state->bs_dtuple; + int i; - tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart, - state->bs_dtuple, &size); - brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, - &state->bs_currentInsertBuf, state->bs_currRangeStart, - tup, size); - state->bs_numtuples++; + /* + * Check if any rows were processed for the page range represented by this + * memtuple. We initially set both allnulls/hasnulls to true to identify + * if the range is in this initial/empty state. + * + * XXX It should be enough to check only the first summary - either all + * summaries are empty or none of them. + */ + for (i = 0; i < state->bs_bdesc->bd_tupdesc->natts; i++) + { + if (!(dtuple->bt_columns[i].bv_allnulls && + dtuple->bt_columns[i].bv_hasnulls)) + { + modified = true; + break; + } + } - pfree(tup); + /* + * If the memtuple was modified (i.e. we added any rows to it), insert it + * into the index. That is, we don't store index tuples not representing + * any rows from table. + * + * XXX This has the undesirable consequence, that if the table has a gap + * (a long sequence of pages with no remaining tuples), we won't have any + * BRIN summaries for this part of the table. Which means that we'll have + * to scan this gap for each bitmap index scan. + */ + if (modified) + { + tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart, + state->bs_dtuple, &size); + brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, + &state->bs_currentInsertBuf, state->bs_currRangeStart, + tup, size); + state->bs_numtuples++; + pfree(tup); + } } /* @@ -1710,24 +1743,53 @@ add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, Datum result; BrinValues *bval; FmgrInfo *addValue; + bool first_row; + bool has_nulls = false; bval = &dtup->bt_columns[keyno]; + /* + * Is this the first tuple we're adding to the range range? We track + * that by setting both bv_hasnulls and bval->bv_allnulls to true + * during initialization. But it's not a valid combination (at most + * one of those flags should be set), so we reset the second flag. + */ + first_row = (bval->bv_hasnulls && bval->bv_allnulls); + + if (bval->bv_hasnulls && bval->bv_allnulls) + { + bval->bv_hasnulls = false; + modified = true; + } + if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno]) { /* * If the new value is null, we record that we saw it if it's the * first one; otherwise, there's nothing to do. + * + * XXX This used to check "hasnulls" but now that might result in + * having both flags set. That used to be OK, because we just + * ignore hasnulls flag in brin_form_tuple when allnulls=true. + * But now we interpret this combination as "firt row" so it + * would confuse following calls. So make sure to only set one + * of the flags - when allnulls=true we're done, as it already + * marks the range as containing ranges. */ - if (!bval->bv_hasnulls) + if (!bval->bv_allnulls) { bval->bv_hasnulls = true; modified = true; } - continue; } + /* + * Does the range already has NULL values? Either of the flags can + * be set, but we ignore the state before adding first row. + */ + has_nulls = (bval->bv_hasnulls || bval->bv_allnulls) && !first_row; + addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, @@ -1736,8 +1798,16 @@ add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, PointerGetDatum(bval), values[keyno], nulls[keyno]); + /* if that returned true, we need to insert the updated tuple */ modified |= DatumGetBool(result); + + /* + * If we had NULLS, and the opclass didn't set allnulls=true, set + * the hasnulls so that we know there are NULL values. + */ + if (has_nulls && !bval->bv_allnulls) + bval->bv_hasnulls = true; } return modified; diff --git a/src/backend/access/brin/brin_tuple.c b/src/backend/access/brin/brin_tuple.c index c0e2dbd23ba..1b5e72cde24 100644 --- a/src/backend/access/brin/brin_tuple.c +++ b/src/backend/access/brin/brin_tuple.c @@ -136,6 +136,13 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, { int datumno; + /* + * We should never get here for memtuples in the initial state, i.e. + * before any rows were added to it. + */ + Assert(!(tuple->bt_columns[keyno].bv_hasnulls && + tuple->bt_columns[keyno].bv_allnulls)); + /* * "allnulls" is set when there's no nonnull value in any row in the * column; when this happens, there is no data to store. Thus set the @@ -516,8 +523,10 @@ brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc) for (i = 0; i < brdesc->bd_tupdesc->natts; i++) { dtuple->bt_columns[i].bv_attno = i + 1; + + /* each memtuple starts as if it represents no rows */ dtuple->bt_columns[i].bv_allnulls = true; - dtuple->bt_columns[i].bv_hasnulls = false; + dtuple->bt_columns[i].bv_hasnulls = true; dtuple->bt_columns[i].bv_values = (Datum *) currdatum; dtuple->bt_columns[i].bv_mem_value = PointerGetDatum(NULL); @@ -585,6 +594,14 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple) { int i; + Assert(!(allnulls[keyno] && hasnulls[keyno])); + + /* + * Make sure to overwrite the hasnulls flag, because it might have + * been initialized as true by brin_memtuple_initialize. + */ + dtup->bt_columns[keyno].bv_hasnulls = hasnulls[keyno]; + if (allnulls[keyno]) { valueno += brdesc->bd_info[keyno]->oi_nstored; diff --git a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out index 02ef52d299a..2266012eac7 100644 --- a/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out +++ b/src/test/modules/brin/expected/summarization-and-inprogress-insertion.out @@ -1,12 +1,6 @@ Parsed test spec with 2 sessions -starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check -step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); -itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value -----------+------+------+--------+--------+-----------+-------- - 1| 0| 1|f |t |f |{1 .. 1} -(1 row) - +starting permutation: s1b s2b s1i s2summ s1c s2c s2check step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; step s2b: BEGIN ISOLATION LEVEL REPEATABLE READ; SELECT 1; ?column? @@ -18,7 +12,7 @@ step s1i: INSERT INTO brin_iso VALUES (1000); step s2summ: SELECT brin_summarize_new_values('brinidx'::regclass); brin_summarize_new_values ------------------------- - 1 + 2 (1 row) step s1c: COMMIT; @@ -31,13 +25,7 @@ itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value (2 rows) -starting permutation: s2check s1b s1i s2vacuum s1c s2check -step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); -itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value -----------+------+------+--------+--------+-----------+-------- - 1| 0| 1|f |t |f |{1 .. 1} -(1 row) - +starting permutation: s1b s1i s2vacuum s1c s2check step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ; step s1i: INSERT INTO brin_iso VALUES (1000); step s2vacuum: VACUUM brin_iso; @@ -45,7 +33,7 @@ step s1c: COMMIT; step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value ----------+------+------+--------+--------+-----------+----------- - 1| 0| 1|f |f |f |{1 .. 1} + 1| 0| 1|f |t |f |{1 .. 1} 2| 1| 1|f |f |f |{1 .. 1000} (2 rows) diff --git a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec index 18ba92b7ba1..6319ae4c38d 100644 --- a/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec +++ b/src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec @@ -41,5 +41,5 @@ step "s2vacuum" { VACUUM brin_iso; } step "s2check" { SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass); } -permutation "s2check" "s1b" "s2b" "s1i" "s2summ" "s1c" "s2c" "s2check" -permutation "s2check" "s1b" "s1i" "s2vacuum" "s1c" "s2check" +permutation "s1b" "s2b" "s1i" "s2summ" "s1c" "s2c" "s2check" +permutation "s1b" "s1i" "s2vacuum" "s1c" "s2check" diff --git a/src/test/modules/brin/t/01_workitems.pl b/src/test/modules/brin/t/01_workitems.pl index 3108c02cf4d..eeec44b0060 100644 --- a/src/test/modules/brin/t/01_workitems.pl +++ b/src/test/modules/brin/t/01_workitems.pl @@ -24,20 +24,21 @@ $node->safe_psql( create index brin_wi_idx on brin_wi using brin (a) with (pages_per_range=1, autosummarize=on); ' ); -my $count = $node->safe_psql('postgres', - "select count(*) from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)" -); -is($count, '1', "initial index state is correct"); $node->safe_psql('postgres', 'insert into brin_wi select * from generate_series(1, 100)'); +$node->poll_query_until( + 'postgres', + "select pg_relation_size('brin_wi_idx'::regclass) / current_setting('block_size')::int = 3", + 't'); + $node->poll_query_until( 'postgres', "select count(*) > 1 from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)", 't'); -$count = $node->safe_psql('postgres', +my $count = $node->safe_psql('postgres', "select count(*) > 1 from brin_page_items(get_raw_page('brin_wi_idx', 2), 'brin_wi_idx'::regclass)" ); is($count, 't', "index got summarized"); diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out index 73fa38396e4..ebc31222354 100644 --- a/src/test/regress/expected/brin.out +++ b/src/test/regress/expected/brin.out @@ -454,7 +454,7 @@ $$; SELECT brin_summarize_range('brin_summarize_idx', 0); brin_summarize_range ---------------------- - 0 + 1 (1 row) -- nothing: already summarized diff --git a/src/test/regress/expected/brin_bloom.out b/src/test/regress/expected/brin_bloom.out index 32c56a996a2..6e847f9113d 100644 --- a/src/test/regress/expected/brin_bloom.out +++ b/src/test/regress/expected/brin_bloom.out @@ -373,7 +373,7 @@ $$; SELECT brin_summarize_range('brin_summarize_bloom_idx', 0); brin_summarize_range ---------------------- - 0 + 1 (1 row) -- nothing: already summarized diff --git a/src/test/regress/expected/brin_multi.out b/src/test/regress/expected/brin_multi.out index f3309f433f8..e65f1c20d4f 100644 --- a/src/test/regress/expected/brin_multi.out +++ b/src/test/regress/expected/brin_multi.out @@ -407,7 +407,7 @@ $$; SELECT brin_summarize_range('brin_summarize_multi_idx', 0); brin_summarize_range ---------------------- - 0 + 1 (1 row) -- nothing: already summarized -- 2.38.1