Thread: [BUGS] BUG #14565: query planner does not use partial index in partiton ifquery is performed on multiple partitions

The following bug has been logged on the website:

Bug reference:      14565
Logged by:          Zbigniew Szot
Email address:      zbigniew.szot@softiq.pl
PostgreSQL version: 9.5.6
Operating system:    linux and  AWS cloud (9.6.1)
Description:

drop schema if exists bug_showcase cascade;
create schema bug_showcase;
set search_path to bug_showcase;

-- crate table
CREATE TABLE  test_table (
    chec_key UUID  NOT NULL  ,
   some_date  date,
    some_data_2 varchar,
     some_data_3 varchar,
      some_data_4 varchar
    );
-- partiton the table + indexes in partitions    
DO $$
BEGIN
 FOR i IN 0..15 LOOP
   EXECUTE 'CREATE TABLE test_table_'||to_hex(i)||' ( CHECK (chec_key >=
UUID '''||to_hex(i)||'0000000-0000-0000-0000-000000000000'' AND chec_key <=
UUID '''||
        to_hex(i)||'fffffff-ffff-ffff-ffff-ffffffffffff'' ) ) INHERITS
(test_table);';
        EXECUTE 'CREATE INDEX test_table_'||to_hex(i)||'_brin on
test_table_'||to_hex(i)||' USING brin (chec_key, some_date);';
    END LOOP;
END$$;

-- this index would be used instead of partial ones but thats not what we
need 
drop index test_table_4_brin;

-- create partial indexes in one of partitions 
DO $$
BEGIN
 FOR i IN 0..15 LOOP
        EXECUTE 'create index partial_not_working_'||to_hex(i)||' on
test_table_4 USING brin ( chec_key, some_date) where (chec_key >= UUID
''4'||to_hex(i)||'000000-0000-0000-0000-000000000000'' AND chec_key <= UUID
''4'||
        to_hex(i)||'ffffff-ffff-ffff-ffff-ffffffffffff'' );';
END LOOP;   
END$$;

-- populate table

insert into test_table_1  (chec_key , some_date , some_data_2 ,
some_data_3,some_data_4 ) select
uuid('1'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
'2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
;
insert into test_table_3  (chec_key , some_date , some_data_2 ,
some_data_3,some_data_4  ) select
uuid('3'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
'2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
;
insert into test_table_4  (chec_key , some_date , some_data_2 ,
some_data_3,some_data_4  ) select
uuid('4'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
'2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
;
insert into test_table_7  (chec_key , some_date , some_data_2 ,
some_data_3,some_data_4  ) select
uuid('7'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
'2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
;

-- .. ;-)
analyse test_table;

-- make strong sugestion to use indexes if possible
set enable_seqscan = off;
-- this one uses  partial_not_working_4
explain select * from test_table where chec_key  in
('4400df00-0000-4000-a000-000000000000' )and some_date <'2015-11-02';
-- this one uses  partial_not_working_4
explain select * from test_table where chec_key  in
('4400df00-0000-4000-a000-000000000000','4401df00-0000-4000-a000-000000000000'
)and some_date <'2015-11-02';

-- this one DOES NOT use  partial_not_working_4 .. bug or feature ? ;-)
explain select * from test_table where chec_key  in
('4400df00-0000-4000-a000-000000000000'
,'1400df00-0000-4000-a000-000000000000')and some_date <'2015-11-02';


--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

On 2017/02/23 20:10, zbigniew.szot@softiq.pl wrote:
> The following bug has been logged on the website:
> 
> Bug reference:      14565
> Logged by:          Zbigniew Szot
> Email address:      zbigniew.szot@softiq.pl
> PostgreSQL version: 9.5.6
> Operating system:    linux and  AWS cloud (9.6.1)
> Description:        
> 
> drop schema if exists bug_showcase cascade;
> create schema bug_showcase;
> set search_path to bug_showcase;
> 
> -- crate table
> CREATE TABLE  test_table (
>     chec_key UUID  NOT NULL  ,
>    some_date  date,
>     some_data_2 varchar,
>      some_data_3 varchar,
>       some_data_4 varchar
>     );
> -- partiton the table + indexes in partitions    
> DO $$
> BEGIN
>  FOR i IN 0..15 LOOP
>    EXECUTE 'CREATE TABLE test_table_'||to_hex(i)||' ( CHECK (chec_key >=
> UUID '''||to_hex(i)||'0000000-0000-0000-0000-000000000000'' AND chec_key <=
> UUID '''||
>         to_hex(i)||'fffffff-ffff-ffff-ffff-ffffffffffff'' ) ) INHERITS
> (test_table);';
>         EXECUTE 'CREATE INDEX test_table_'||to_hex(i)||'_brin on
> test_table_'||to_hex(i)||' USING brin (chec_key, some_date);';
>     END LOOP;
> END$$;
> 
> -- this index would be used instead of partial ones but thats not what we
> need 
> drop index test_table_4_brin;
> 
> -- create partial indexes in one of partitions 
> DO $$
> BEGIN
>  FOR i IN 0..15 LOOP
>         EXECUTE 'create index partial_not_working_'||to_hex(i)||' on
> test_table_4 USING brin ( chec_key, some_date) where (chec_key >= UUID
> ''4'||to_hex(i)||'000000-0000-0000-0000-000000000000'' AND chec_key <= UUID
> ''4'||
>         to_hex(i)||'ffffff-ffff-ffff-ffff-ffffffffffff'' );';
> END LOOP;   
> END$$;
> 
> -- populate table
> 
> insert into test_table_1  (chec_key , some_date , some_data_2 ,
> some_data_3,some_data_4 ) select  
> uuid('1'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
> '2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
> ,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
> ;
> insert into test_table_3  (chec_key , some_date , some_data_2 ,
> some_data_3,some_data_4  ) select  
> uuid('3'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
> '2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
> ,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
> ;
> insert into test_table_4  (chec_key , some_date , some_data_2 ,
> some_data_3,some_data_4  ) select  
> uuid('4'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
> '2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
> ,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
> ;
> insert into test_table_7  (chec_key , some_date , some_data_2 ,
> some_data_3,some_data_4  ) select  
> uuid('7'||lpad(to_hex(i),5,'0')||'00-0000-4000-a000-000000000000'),date
> '2015-10-1' + random() *  interval '2 days' ,md5(random()::text)
> ,md5(random()::text),md5(random()::text)  from generate_Series(0,1048575) i
> ;
> 
> -- .. ;-)
> analyse test_table;
> 
> -- make strong sugestion to use indexes if possible
> set enable_seqscan = off;
> -- this one uses  partial_not_working_4
> explain select * from test_table where chec_key  in
> ('4400df00-0000-4000-a000-000000000000' )and some_date <'2015-11-02';
> -- this one uses  partial_not_working_4
> explain select * from test_table where chec_key  in
> ('4400df00-0000-4000-a000-000000000000','4401df00-0000-4000-a000-000000000000'
> )and some_date <'2015-11-02';
> 
> -- this one DOES NOT use  partial_not_working_4 .. bug or feature ? ;-)
> explain select * from test_table where chec_key  in
> ('4400df00-0000-4000-a000-000000000000'
> ,'1400df00-0000-4000-a000-000000000000')and some_date <'2015-11-02';

Not a bug, I'd think.

The WHERE condition does not imply partial_not_working_4's predicate, so
it cannot possibly be used.  The index does not contain entries for some
of the rows being requested by the query (i.e. those for which chec_key =
'1400df00-0000-4000-a000-000000000000').

A simpler example:

create table foo (a) as select generate_series(1, 100000);
create index on foo (a) where a >= 1 and a < 10;

explain (costs off) select * from foo where a in (1, 9);
                     QUERY PLAN
----------------------------------------------------
 Bitmap Heap Scan on foo
   Recheck Cond: (a = ANY ('{1,9}'::integer[]))
   ->  Bitmap Index Scan on foo_a_idx
         Index Cond: (a = ANY ('{1,9}'::integer[]))
(4 rows)

-- foo_a_idx does not contain 10
explain (costs off) select * from foo where a in (1, 10);
                QUERY PLAN
-------------------------------------------
 Seq Scan on foo
   Filter: (a = ANY ('{1,10}'::integer[]))
(2 rows)

Thanks,
Amit




-- 
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes:
> On 2017/02/23 20:10, zbigniew.szot@softiq.pl wrote:
>> -- this one DOES NOT use  partial_not_working_4 .. bug or feature ? ;-)
>> explain select * from test_table where chec_key  in
>> ('4400df00-0000-4000-a000-000000000000'
>> ,'1400df00-0000-4000-a000-000000000000')and some_date <'2015-11-02';

> Not a bug, I'd think.

After looking at this more closely, I think the OP is probably wishing
that the planner would consider a BitmapOr plan on two different partial
indexes.  You can get it to consider that if the query is phrased as an
OR, but not when it's written like this with IN (which will get converted
to an "= ANY(ARRAY[])" condition).

Trivial example:

regression=# create table foo (f1 int, f2 text);
CREATE TABLE
regression=# create index on foo(f1) where f1 >= 0 and f1 < 10;
CREATE INDEX
regression=# create index on foo(f1) where f1 >= 10 and f1 < 20;
CREATE INDEX
regression=# explain select * from foo where f1 in (7, 11);
                      QUERY PLAN
------------------------------------------------------
 Seq Scan on foo  (cost=0.00..25.88 rows=13 width=36)
   Filter: (f1 = ANY ('{7,11}'::integer[]))
(2 rows)

regression=# explain select * from foo where f1 = 7 or f1 = 11;
                                   QUERY PLAN
--------------------------------------------------------------------------------
 Bitmap Heap Scan on foo  (cost=8.27..19.00 rows=13 width=36)
   Recheck Cond: ((f1 = 7) OR (f1 = 11))
   ->  BitmapOr  (cost=8.27..8.27 rows=13 width=0)
         ->  Bitmap Index Scan on foo_f1_idx  (cost=0.00..4.13 rows=6 width=0)
               Index Cond: (f1 = 7)
         ->  Bitmap Index Scan on foo_f1_idx1  (cost=0.00..4.13 rows=6 width=0)
               Index Cond: (f1 = 11)
(7 rows)

You could certainly claim it's a bug that these two phrasings of the query
aren't treated 100% identically, but I'd tell you to get lost.  The IN
planning code is designed to handle fairly large numbers of IN items
without planner performance going into the toilet; it's not practical
for it to consider a different index for each item.

The underlying reason why I'm not very excited about this issue is that
I think the above-depicted index design is fundamentally stupid anyway.
It's much simpler, both for you and for the planner, just to make one
non-partial index on the whole range of f1.  And I know of no reason to
believe that multiple partial indexes would outperform that design for
any ordinary workload.

            regards, tom lane


--
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

On 2017-02-24 06:41, Tom Lane wrote:
> Amit Langote <Langote_Amit_f8@lab.ntt.co.jp> writes:
>> On 2017/02/23 20:10, zbigniew.szot@softiq.pl wrote:
>>> -- this one DOES NOT use  partial_not_working_4 .. bug or feature ? 
>>> ;-)
>>> explain select * from test_table where chec_key  in
>>> ('4400df00-0000-4000-a000-000000000000'
>>> ,'1400df00-0000-4000-a000-000000000000')and some_date <'2015-11-02';
> 
>> Not a bug, I'd think.
> 
> After looking at this more closely, I think the OP is probably wishing
> that the planner would consider a BitmapOr plan on two different 
> partial
> indexes.  You can get it to consider that if the query is phrased as an
> OR, but not when it's written like this with IN (which will get 
> converted
> to an "= ANY(ARRAY[])" condition).
> 
> Trivial example:
> 
> regression=# create table foo (f1 int, f2 text);
> CREATE TABLE
> regression=# create index on foo(f1) where f1 >= 0 and f1 < 10;
> CREATE INDEX
> regression=# create index on foo(f1) where f1 >= 10 and f1 < 20;
> CREATE INDEX
> regression=# explain select * from foo where f1 in (7, 11);
>                       QUERY PLAN
> ------------------------------------------------------
>  Seq Scan on foo  (cost=0.00..25.88 rows=13 width=36)
>    Filter: (f1 = ANY ('{7,11}'::integer[]))
> (2 rows)
> 
> regression=# explain select * from foo where f1 = 7 or f1 = 11;
>                                    QUERY PLAN
> --------------------------------------------------------------------------------
>  Bitmap Heap Scan on foo  (cost=8.27..19.00 rows=13 width=36)
>    Recheck Cond: ((f1 = 7) OR (f1 = 11))
>    ->  BitmapOr  (cost=8.27..8.27 rows=13 width=0)
>          ->  Bitmap Index Scan on foo_f1_idx  (cost=0.00..4.13 rows=6 
> width=0)
>                Index Cond: (f1 = 7)
>          ->  Bitmap Index Scan on foo_f1_idx1  (cost=0.00..4.13 rows=6 
> width=0)
>                Index Cond: (f1 = 11)
> (7 rows)
> 
> You could certainly claim it's a bug that these two phrasings of the 
> query
> aren't treated 100% identically, but I'd tell you to get lost.  The IN
> planning code is designed to handle fairly large numbers of IN items
> without planner performance going into the toilet; it's not practical
> for it to consider a different index for each item.
> 
> The underlying reason why I'm not very excited about this issue is that
> I think the above-depicted index design is fundamentally stupid anyway.
> It's much simpler, both for you and for the planner, just to make one
> non-partial index on the whole range of f1.  And I know of no reason to
> believe that multiple partial indexes would outperform that design for
> any ordinary workload.
> 
>             regards, tom lane

The thing that you are all (not) saying  and consider obvious is that 
indexes  created on partitions are considered by a query planer as if 
they were set on "main" table -in this case you are right t his query 
does not hit the index and sequence scan is the only option... but well 
... in fact it is not true ;-)
It's of course arguable if such a behaviour is a bug, but I find it sub 
optimal (for really big tables specially).

1)

You are right I made an assumption that in(..) with less then 10 
elements is evaluated to sequence of OR's (this is a redshift feature.. 
since redshift is based on postgresql I thought it's also a postgres one 
.. I was wrong). This is more like performance feature rather then bug 
anyway ;-)
But still.. This is not the problem.

2)
I made another more important  assumption ( and possibly also wrong) 
that if table is partitioned (means its technical set of tables) then 
the where condition  is  "split" and redistributed to partitions and 
merged later on.
In this case this would be sort of
with temp as (select * from test_table_1 where chec_key  = 
'1400df00-0000-4000-a000-000000000000' union select *  from  
test_table_4 where chec_key = '4400df00-0000-4000-a000-000000000000')
being so the  second part is straight shooter for  partial_not_working_4 
and the first part  hits test_table_1_brinn (this also could benefit 
from parallel processing anyway...)

If I don't drop test_table_4_brinn index then the query is hitting 
test_table_1_brinn and test_table4_brinn which is exactly what expected.

As I assume from yours post it hits those indexes just because they are 
condition less / non partial and operate on columns from where clause - 
so they are valid for a whole where clause. not just the partition 
specific as I thought.

background of "why to do that" :

What I was trying to achieve is sort of "partition the partition" since 
test_table_4 has grown big enough to make indexes choking..  I decided 
to split indexes.
This is (was)  an alternative to reparation the table - repartition => 
maintenance break => CEO heart attack ;-) .. and indexes can be build on 
a fly.

Ps.

I'm more Dev then Op ;-)

-- 
ZBIGNIEW SZOT



-- 
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs