Thread: [Testcase] Queries running forever, because of wrong rowcount estimate

[Testcase] Queries running forever, because of wrong rowcount estimate

From
Peter
Date:
-- Test database for the rowcount-estimate reproduction below.
-- German UTF-8 collation mirrors the reporter's environment
-- (PostgreSQL 12.13 on FreeBSD 13.1 -- see the version() output below).
CREATE DATABASE ttc
    WITH
    OWNER = admin
    ENCODING = 'UTF8'
    LC_COLLATE = 'de_DE.UTF-8'
    LC_CTYPE = 'de_DE.UTF-8'
    TABLESPACE = pg_default
    CONNECTION LIMIT = -1
    IS_TEMPLATE = False;

select version();
PostgreSQL 12.13 on amd64-portbld-freebsd13.1, compiled by FreeBSD clang version 13.0.0
(git@github.com:llvm/llvm-project.gitllvmorg-13.0.0-0-gd7b669b3a303), 64-bit
 


-- Source table: will hold 3 million rows but only three distinct
-- values in "data" ('EINS', 'ZWEI', 'DREI'), one million rows each.
CREATE TABLE  public.incoming
(
    id bigint NOT NULL,
    data text COLLATE pg_catalog."default",
    CONSTRAINT incoming_pkey PRIMARY KEY (id)
)
TABLESPACE pg_default;

-- Populate in three batches; each UPDATE labels the batch just
-- inserted (its "data" is still NULL) with a distinct value.
insert into incoming(id) select generate_series(1,1000000);
update incoming set data = 'EINS' where data IS NULL;
insert into incoming(id) select generate_series(1000001,2000000);
update incoming set data = 'ZWEI' where data IS NULL;
insert into incoming(id) select generate_series(2000001,3000000);
update incoming set data = 'DREI' where data IS NULL;

-- Destination table: "data" values are unique (primary key on data).
CREATE TABLE IF NOT EXISTS public.data
(
    data text COLLATE pg_catalog."default" NOT NULL,
    CONSTRAINT data_pkey PRIMARY KEY (data)
)
TABLESPACE pg_default;

-- Seed with two of the three values present in incoming.
insert into data (data) values ('EINS'), ('DREI');

-- Gather statistics so the planner has fresh estimates for both tables.
analyze public.data;
analyze public.incoming;

-- Anti-join: incoming.data values with no match in data.
-- With data non-empty and analyzed, the estimate is sane
-- (rows=1000000 on the Hash Anti Join -- see the plan below).
explain
select distinct data
from incoming 
Left outer join public.data using (data)
where data.data is null;

 HashAggregate  (cost=67371.04..67371.07 rows=3 width=5)
   Group Key: incoming.data
   ->  Hash Anti Join  (cost=0.55..64871.04 rows=1000000 width=5)
         Hash Cond: (incoming.data = data.data)
         ->  Seq Scan on incoming  (cost=0.00..44745.50 rows=3000000 width=5)
         ->  Hash  (cost=0.52..0.52 rows=2 width=5)
               ->  Seq Scan on data  (cost=0.00..0.52 rows=2 width=5)

-- Empty the destination and refresh its stats, leaving
-- relpages = reltuples = 0 in pg_class.
delete from data;
vacuum analyze data;

-- Same anti-join query: with the table known-empty, the v12 planner
-- misestimates the join output as rows=1 (see the plan and the
-- discussion below) instead of ~3000000.
explain
select distinct data
from incoming 
Left outer join public.data using (data)
where data.data is null;

 Unique  (cost=56056.08..56056.09 rows=1 width=5)
   ->  Sort  (cost=56056.08..56056.09 rows=1 width=5)
         Sort Key: incoming.data
         ->  Hash Anti Join  (cost=60.58..56056.07 rows=1 width=5)
               Hash Cond: (incoming.data = data.data)
               ->  Seq Scan on incoming  (cost=0.00..44745.50 rows=3000000 width=5)
               ->  Hash  (cost=29.70..29.70 rows=2470 width=5)
                     ->  Seq Scan on data  (cost=0.00..29.70 rows=2470 width=5)

"rows=1" in the "Hash Anti Join" line is WRONG. It should be
3000000. Or at least some thousands.
On the next-higher level there will now be a Nested Loop chosen. And that
Nested Loop will do whatever costly things it needs to do - only not 1
time but three million times.

I think I have a workaround patch also.

-------------------------------------------------------------------
Server Configuration Tuning:

 
< #port = 5432                          # (change requires restart)
< #max_connections = 100                        # (change requires restart)
---
> port = 5434                           # (change requires restart)
> max_connections = 60                  # (change requires restart)

< #shared_buffers = 32MB                        # min 128kB
---
> shared_buffers = 40MB                 # min 128kB

< #temp_buffers = 8MB                   # min 800kB
---
> temp_buffers = 20MB                   # min 800kB

< #work_mem = 4MB                               # min 64kB
< #maintenance_work_mem = 64MB          # min 1MB
---
> work_mem = 50MB                       # min 64kB
> maintenance_work_mem = 50MB           # min 1MB

< #max_stack_depth = 2MB                        # min 100kB
---
> max_stack_depth = 40MB                        # min 100kB

< #max_files_per_process = 1000         # min 25
---
> max_files_per_process = 200           # min 25

< #effective_io_concurrency = 1         # 1-1000; 0 disables prefetching
---
> effective_io_concurrency = 5          # 1-1000; 0 disables prefetching

< #max_parallel_workers_per_gather = 2  # taken from max_parallel_workers
---
> max_parallel_workers_per_gather = 0   # taken from max_parallel_workers

< #synchronous_commit = on              # synchronization level;
---
> synchronous_commit = off              # synchronization level;

< #wal_sync_method = fsync              # the default is the first option
---
> wal_sync_method = fsync               # the default is the first option

< #full_page_writes = on                        # recover from partial page writes
---
> full_page_writes = off                        # recover from partial page writes

< #wal_init_zero = on                   # zero-fill new WAL files
---
> wal_init_zero = off                   # zero-fill new WAL files

< #wal_writer_delay = 200ms             # 1-10000 milliseconds
---
> wal_writer_delay = 2000ms             # 1-10000 milliseconds

< #checkpoint_timeout = 5min            # range 30s-1d
---
> checkpoint_timeout = 10min            # range 30s-1d

< #checkpoint_completion_target = 0.5   # checkpoint target duration, 0.0 - 1.0
---
> checkpoint_completion_target = 0.0    # checkpoint target duration, 0.0 - 1.0

< #seq_page_cost = 1.0                  # measured on an arbitrary scale
< #random_page_cost = 4.0                       # same scale as above
---
> seq_page_cost = 0.5                   # measured on an arbitrary scale
> random_page_cost = 0.7                        # same scale as above / PMc: SSD

< #effective_cache_size = 4GB
---
> effective_cache_size = 1GB

< update_process_title = off
---
> update_process_title = on

< #autovacuum = on                      # Enable autovacuum subprocess?  'on'
---
> autovacuum = off                      # Enable autovacuum subprocess?  'on' 

< #datestyle = 'iso, mdy'
---
> datestyle = 'german, dmy'

< #timezone = 'GMT'
---
> timezone = 'Europe/Berlin'

< #lc_messages = 'C'                    # locale for system error message
---
> lc_messages = 'en_US.UTF-8'                   # locale for system error message

< #lc_monetary = 'C'                    # locale for monetary formatting
< #lc_numeric = 'C'                     # locale for number formatting
< #lc_time = 'C'                                # locale for time formatting
---
> lc_monetary = 'en_US.UTF-8'                   # locale for monetary formatting
> lc_numeric = 'en_US.UTF-8'                    # locale for number formatting
> lc_time = 'de_DE.UTF-8'                               # locale for time formatting

< #default_text_search_config = 'pg_catalog.simple'
---
> default_text_search_config = 'pg_catalog.german'



Re: [Testcase] Queries running forever, because of wrong rowcount estimate

From
Tom Lane
Date:
Peter <pmc@citylink.dinoex.sub.org> writes:
> "rows=1" in the "Hash Anti Join" line is WRONG. It should be
> 3000000. Or at least some thousands.

FWIW, this behaves better in v14 and up.  In older versions there's
an ambiguity about what relpages=reltuples=0 means; it's the state
when ANALYZE has never yet examined the table contents, but it's
also the state when we have found the table to be empty.  Because
of the former case, older planner versions don't like to believe
that a table is empty even if reltuples says so.

            regards, tom lane



On Mon, Feb 13, 2023 at 12:38:12PM -0500, Tom Lane wrote:
! Peter <pmc@citylink.dinoex.sub.org> writes:
! > "rows=1" in the "Hash Anti Join" line is WRONG. It should be
! > 3000000. Or at least some thousands.
! 
! FWIW, this behaves better in v14 and up.  In older versions there's
! an ambiguity about what relpages=reltuples=0 means; it's the state
! when ANALYZE has never yet examined the table contents, but it's
! also the state when we have found the table to be empty.  Because
! of the former case, older planner versions don't like to believe
! that a table is empty even if reltuples says so.

Thanks, Tom, I found that as change 3d351d916b2 - that should
make the issue more unlikely and not so prone to happen during
testing.
OTOH, that code did apparently not do anything evil that would
not be possible to happen under other circumstances. It's unlikely,
because one would at some point change such queries to a more
performant style, but I am currently trying to understand why
it can happen at all...

cheers,
PMc



[Outcome] Queries running forever, because of wrong rowcount estimate

From
Peter
Date:
On Mon, Feb 13, 2023 at 12:38:12PM -0500, Tom Lane wrote:
! Peter <pmc@citylink.dinoex.sub.org> writes:
! > "rows=1" in the "Hash Anti Join" line is WRONG. It should be
! > 3000000. Or at least some thousands.
! 
! FWIW, this behaves better in v14 and up.  In older versions there's
! an ambiguity about what relpages=reltuples=0 means; it's the state
! when ANALYZE has never yet examined the table contents, but it's
! also the state when we have found the table to be empty.  Because
! of the former case, older planner versions don't like to believe
! that a table is empty even if reltuples says so.
! 
!             regards, tom lane

Okay, I got someway through with it.

Given, the destination table is empty, and VACUUMed. Then there is
relpages = reltuples = 0.

We do some
"source LEFT OUTER JOIN destination ON (matchfield) WHERE matchfield IS NULL"
The source is big, but carries only a few distinct matchfield values.

The so-called "hack" in heapam_handler.c:heapam_estimate_rel_size()
then makes the planner believe that there are actually 10
curpages. This is not wrong, because the same might happen if one does
an INSERT amounting to 10 pages and has not yet run ANALYZE.

We then get a calculated rowcount of whatever, let's assume 2330 rows.

Then we go into eqjoinsel():

n_distinct for the left source table is known, it is (let's assume) 4.
n_distinct for the right destination table is not known because it is
not analyzed, but it is UNIQUE, so it becomes 2330.

We get into eqjoinsel_inner(): 

MCV (right) for the destination is not known because it is not analyzed.

selec = selec / nd2   => 0.000429

We get into eqjoinsel_semi():

                        if (nd1 <= nd2 || nd2 < 0)
                                selec = 1.0 - nullfrac1;

Voila, selec = 1.0

And so the ANTI JOIN will estimate to 0 (aka 1) result rows, instead
of whatever rowcount the source brings along (usually big).

----------------- cut -----------------------
Okay, so this should be fixed in R.14.

But, as mentioned above, the same can also happen in normal operations

Example:
I have a logfile from my webserver and a table to collect the (unique)
IP-addresses. I start with an empty table and feed in the log.

First round, the destination addresses. Let's assume there are 2330
different addresses.
Second round, the source addresses. There are only 4 different ones.

I don't do an ANALYZE between the two steps.

And voila, I get the same effect as above. (Been there, done
it: duration: 30216273.583 ms)

Agreed, this is an inefficient way of doing that. It would be better
to do a UNION on the source data, at that point filter it accordingly,
then DISTINCT, and only then the ANTI JOIN. 

But I'm a lazy guy and I may not care if a query that could run in
0.5 seconds takes 60 seconds instead.
However, it's not so nice when it takes 10 hours.

Anyway, I don't get a real clue on what this code stanza should actually
achieve (from selfuncs.c:eqjoinsel_semi()#2773):

            if (nd1 <= nd2 || nd2 < 0)
                selec = 1.0 - nullfrac1;
            else
                selec = (nd2 / nd1) * (1.0 - nullfrac1);

This seems to assume a 100% match whenever the left (source) table
brings fewer distinct(!) values than the right (destination) table
already carries.

For now, I have just disabled this behaviour, in the rough&dirty way:

--- src/backend/optimizer/path/costsize.c.orig  2022-11-07 22:47:13.000000000 +0100
+++ src/backend/optimizer/path/costsize.c       2023-02-13 00:04:54.156489000 +0100
@@ -4685,8 +4685,11 @@
                        /* pselec not used */
                        break;
                case JOIN_ANTI:
+                       if(fkselec * jselec > 0.9975) {
+                               jselec = 0.9975;
+                       }
                        nrows = outer_rows * (1.0 - fkselec * jselec);
                        nrows *= pselec;
                        break;
                default:
                        /* other values not expected here */


cheerio,
PMc