Re: Anti join confusion - Mailing list pgsql-hackers

From Tender Wang
Subject Re: Anti join confusion
Date
Msg-id CAHewXN=iKJm7ZaCG0+VfvzL7zU-HZxUVRp4q5Mk-FpxcdmQj+w@mail.gmail.com
Whole thread Raw
In response to Anti join confusion  (wenhui qiu <qiuwenhuifx@gmail.com>)
List pgsql-hackers


wenhui qiu <qiuwenhuifx@gmail.com> 于2025年2月24日周一 09:48写道:
Hi Richard Guo
   I found this path https://commitfest.postgresql.org/patch/3235/ already supports anti join , But I've found that in many cases it doesn't work.It always uses SubPlan Here's my testing process.

###########
create table join1 (id integer,name varchar(300),k1 integer);    
create table join2 (id integer,name varchar(300),score integer);
insert into join1 values ( generate_series(1,20000),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join1 values ( generate_series(1,20000),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join1 values ( generate_series(1,20000),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join1 values ( generate_series(50201,50300),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAASSSSSAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join1 values ( generate_series(50201,50300),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAASSSSSAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join1 values ( generate_series(150201,1350300),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAASSSSSAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',10);
insert into join2 values ( generate_series(1,40000),'aaaaaaaaaaaaaaaaAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',1);
insert into join2 values ( generate_series(1,40000),'aaaaaaaaaaaaaaaaAAAAAAABBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',2);
insert into join2 values ( generate_series(20001,22000),'aaaaaaaaaaaaaaaaAACCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',3);
insert into join2 values ( generate_series(150201,950300),'aaaaaaaaaaaaaaaaAACCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAaaAAAAAAAAAAAAAAAAAAAAaaaaaaaAAAAAAAAAAAAAAAAAAAAA',3);
create index idx_j1 on join1(id);
create index idx_j2 on join2(id);
VACUUM ANALYZE JOIN1;
VACUUM ANALYZE JOIN2;    

test=#  explain  SELECT T1.id,T1.K1 FROM join1 t1 WHERE T1.id NOT IN (SELECT T2.id FROM join2 t2 WHERE T2.ID>10000)
test-# ;
                                               QUERY PLAN
---------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.42..9016319078.86 rows=630150 width=8)
   Workers Planned: 2
   ->  Parallel Seq Scan on join1 t1  (cost=0.42..9016255063.86 rows=262562 width=8)
         Filter: (NOT (ANY (id = (SubPlan 1).col1)))
         SubPlan 1
           ->  Materialize  (cost=0.42..32181.54 rows=863294 width=4)
                 ->  Index Only Scan using idx_j2 on join2 t2  (cost=0.42..24492.07 rows=863294 width=4)
                       Index Cond: (id > 10000)
(8 rows)

test=#  explain  SELECT T1.id,T1.K1 FROM join1 t1 WHERE T1.id NOT IN (SELECT T2.id FROM join2 t2 );
                                               QUERY PLAN
---------------------------------------------------------------------------------------------------------
 Gather  (cost=1000.42..8633476697.61 rows=630150 width=8)
   Workers Planned: 2
   ->  Parallel Seq Scan on join1 t1  (cost=0.42..8633412682.61 rows=262562 width=8)
         Filter: (NOT (ANY (id = (SubPlan 1).col1)))
         SubPlan 1
           ->  Materialize  (cost=0.42..30676.42 rows=882100 width=4)
                 ->  Index Only Scan using idx_j2 on join2 t2  (cost=0.42..22819.92 rows=882100 width=4)
(7 rows)

test=#  explain  SELECT T1.id,T1.K1 FROM join1 t1 WHERE T1.id NOT IN (SELECT T2.id FROM join2 t2 where T2.ID < 1000);
                                       QUERY PLAN
----------------------------------------------------------------------------------------
 Seq Scan on join1 t1  (cost=61.73..38730.47 rows=630150 width=8)
   Filter: (NOT (ANY (id = (hashed SubPlan 1).col1)))
   SubPlan 1
     ->  Index Only Scan using idx_j2 on join2 t2  (cost=0.42..57.06 rows=1865 width=4)
           Index Cond: (id < 1000)
(5 rows)

test=#  explain  SELECT T1.id,T1.K1 FROM join1 t1 WHERE T1.id NOT IN (SELECT T2.id FROM join2 t2 where T2.ID = 1000);
                                     QUERY PLAN
------------------------------------------------------------------------------------
 Seq Scan on join1 t1  (cost=4.45..38673.19 rows=630150 width=8)
   Filter: (NOT (ANY (id = (hashed SubPlan 1).col1)))
   SubPlan 1
     ->  Index Only Scan using idx_j2 on join2 t2  (cost=0.42..4.44 rows=1 width=4)
           Index Cond: (id = 1000)
(5 rows)


Planner now doesn't support pulling up the "NOT IN" sublink. The "NOT IN" sublink will be transformed into SubPlan. 


--
Thanks,
Tender Wang

pgsql-hackers by date:

Previous
From: Andy Alsup
Date:
Subject: Re: Update docs for UUID data type
Next
From: Tom Lane
Date:
Subject: Re: Anti join confusion