Thread: Question about query optimization
Hello!

I have two tables: component, with unchanging component data, and component_history, containing the history of some other values that can change over time. The component_history table holds a foreign key to the component_id column of the component table and has a primary key over the columns (component_id, history_timestamp).

We often need to get the situation at a given time out of these tables, and at the moment we use the following query:

--------------------------------------------------------
SELECT *
FROM component
JOIN component_history AS c_h USING (component_id)
WHERE history_timestamp = (
    SELECT history_timestamp
    FROM component_history
    WHERE c_h.component_id = component_history.component_id
      AND history_timestamp <= '2006-10-01'
    ORDER BY history_timestamp DESC
    LIMIT 1
)
--------------------------------------------------------

The query gets executed like this:

--------------------------------------------------------
Hash Join  (cost=32540.55..32665.07 rows=32 width=78) (actual time=118.958..136.416 rows=4160 loops=1)
  Hash Cond: ("outer".component_id = "inner".component_id)
  ->  Seq Scan on component  (cost=0.00..71.31 rows=4231 width=19) (actual time=0.004..3.685 rows=4231 loops=1)
  ->  Hash  (cost=32540.47..32540.47 rows=32 width=63) (actual time=118.165..118.165 rows=0 loops=1)
        ->  Seq Scan on component_history c_h  (cost=0.00..32540.47 rows=32 width=63) (actual time=0.092..111.985 rows=4160 loops=1)
              Filter: (history_timestamp = (subplan))
              SubPlan
                ->  Limit  (cost=6.27..6.28 rows=1 width=8) (actual time=0.016..0.017 rows=1 loops=5165)
                      ->  Sort  (cost=6.27..6.28 rows=2 width=8) (actual time=0.014..0.014 rows=1 loops=5165)
                            Sort Key: history_timestamp
                            ->  Index Scan using component_history_pkey on component_history  (cost=0.00..6.26 rows=2 width=8) (actual time=0.007..0.009 rows=1 loops=5165)
                                  Index Cond: (($0 = component_id) AND (history_timestamp <= '01.10.2006 00:00:00'::timestamp without time zone))
Total runtime: 139.044 ms
--------------------------------------------------------

Is there any other, more performant way to get the last history entry for a given date than this query? Queries of this kind are used often in our application, and a more performant solution would speed things up a lot.

Thanks for your suggestions!

Greetings,
Matthias
On 11/15/06, Matthias.Pitzl@izb.de <Matthias.Pitzl@izb.de> wrote:
Create an (independent) index on history_timestamp column and use a min/max in the subquery.
More specifically, your query should look like this:
SELECT *
FROM component
JOIN component_history AS c_h USING (component_id)
WHERE history_timestamp = (SELECT max(history_timestamp)
                           FROM component_history
                           WHERE c_h.component_id = component_history.component_id)
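The index I mean is just a plain index on the timestamp column. Note that if you still need the state as of a particular date, the date bound from your original query would presumably go back into the max() subquery. A rough sketch (the index name is only an assumed example):

-- index name is an assumed example
CREATE INDEX component_history_ts_idx ON component_history (history_timestamp);

SELECT *
FROM component
JOIN component_history AS c_h USING (component_id)
WHERE history_timestamp = (SELECT max(history_timestamp)
                           FROM component_history
                           WHERE c_h.component_id = component_history.component_id
                             AND history_timestamp <= '2006-10-01');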
Here's a session snippet for an example of how drastically that can reduce the cost and the run-time:
postgres=# drop table t;
DROP TABLE
postgres=# create table t ( a int, b int );
CREATE TABLE
postgres=# insert into t select s, 99999-s from generate_series(0,99999) as s;
INSERT 0 100000
postgres=# analyze t;
ANALYZE
postgres=# explain select count(*) from t o where a = (select max(a) from t i where i.b = o.b);
                                QUERY PLAN
--------------------------------------------------------------------------
 Aggregate  (cost=179103292.25..179103292.26 rows=1 width=0)
   ->  Seq Scan on t o  (cost=0.00..179103291.00 rows=500 width=0)
         Filter: (a = (subplan))
         SubPlan
           ->  Aggregate  (cost=1791.01..1791.02 rows=1 width=4)
                 ->  Seq Scan on t i  (cost=0.00..1791.00 rows=1 width=4)
                       Filter: (b = $0)
(7 rows)
Time: 0.000 ms
postgres=# create index ind_t_a on t(a) ;
CREATE INDEX
Time: 719.000 ms
postgres=# create index ind_t_b on t(b);
CREATE INDEX
Time: 750.000 ms
postgres=# explain select count(*) from t o where a = (select max(a) from t i where i.b = o.b);
                                QUERY PLAN
--------------------------------------------------------------------------
 Aggregate  (cost=806146.25..806146.26 rows=1 width=0)
   ->  Seq Scan on t o  (cost=0.00..806145.00 rows=500 width=0)
         Filter: (a = (subplan))
         SubPlan
           ->  Aggregate  (cost=8.03..8.04 rows=1 width=4)
                 ->  Index Scan using ind_t_b on t i  (cost=0.00..8.03 rows=1 width=4)
                       Index Cond: (b = $0)
(7 rows)
Time: 15.000 ms
/* and now the execution times */
postgres=# drop index ind_t_a, ind_t_b;
DROP INDEX
Time: 0.000 ms
postgres=# select count(*) from t o where a = (select max(a) from t i where i.b = o.b);
Cancel request sent (had to cancel after 1 minute)
ERROR: canceling statement due to user request
postgres=# create index ind_t_a on t(a) ;
CREATE INDEX
Time: 687.000 ms
postgres=# create index ind_t_b on t(b);
CREATE INDEX
Time: 765.000 ms
postgres=# select count(*) from t o where a = (select max(a) from t i where i.b = o.b);
count
--------
100000
(1 row)
Time: 2704.000 ms
postgres=#
--
gurjeet[.singh]@EnterpriseDB.com
singh.gurjeet@{ gmail | hotmail | yahoo }.com
Hello Gurjeet!
Tried your suggestion but this is just a marginal improvement.
Our query needs 126 ms, your query 110 ms.
Greetings,
Matthias
On 11/15/06, Gurjeet Singh <singh.gurjeet@gmail.com> wrote:
Sorry for such a bad example... In case you haven't noticed, ind_t_a was not used anywhere in those plans. My mistake... I was trying some other non-correlated subqueries, and ind_t_a got picked up for those; so I assumed that it'd get picked up for correlated subqueries too! But it didn't.
BTW, here's a query that would use ind_t_a:
explain select * from t where a = (select max(a) from t);
I'll try to come up with a better example for correlated subqueries.
--
gurjeet[.singh]@EnterpriseDB.com
singh.gurjeet@{ gmail | hotmail | yahoo }.com
On 11/15/06, Matthias.Pitzl@izb.de <Matthias.Pitzl@izb.de> wrote:
I do not see an index access on the component table.... Do you have an index on component.component_id? Even a non-unique index will be of great help.
Correcting my previous mistake: here's a query that looks more or less like yours. T1 is your component table, t2 is comp_hist, and t3 is again comp_hist. As can be seen from the plan, ind_t_b is used for all three of these aliases. What this means for you is: create indexes on the component_id columns of both of your tables.
The cost with an index on B is 440 times less than without it.
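Applied to your schema (table and column names taken from your description; treat this as a sketch only, since the primary keys you mentioned may already give you usable indexes on component_id), that would be something like:

-- Assumed statements; skip any index that an existing primary key already covers.
CREATE INDEX component_component_id_idx ON component (component_id);
CREATE INDEX component_history_component_id_idx ON component_history (component_id);

The session below shows the same effect on the toy table t: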
postgres=# explain
postgres-# select count(*)
postgres-# from t as t1,
postgres-# t as t2
postgres-# where t1.b = t2.b
postgres-# and t2.a = (select max(a)
postgres(# from t as t3
postgres(# where t3.b = t1.b )
postgres-# ;
                                   QUERY PLAN
---------------------------------------------------------------------------------------
 Aggregate  (cost=358227614.66..358227614.67 rows=1 width=0)
   ->  Merge Join  (cost=23114.64..358227614.65 rows=1 width=0)
         Merge Cond: (("outer"."?column2?" = t2.a) AND (t1.b = t2.b))
         ->  Sort  (cost=11557.32..11807.32 rows=100000 width=4)
               Sort Key: (subplan), t1.b
               ->  Seq Scan on t t1  (cost=0.00..1541.00 rows=100000 width=4)
               SubPlan
                 ->  Aggregate  (cost=1791.01..1791.02 rows=1 width=4)
                       ->  Seq Scan on t t3  (cost=0.00..1791.00 rows=1 width=4)
                             Filter: (b = $0)
         ->  Sort  (cost=11557.32..11807.32 rows=100000 width=8)
               Sort Key: t2.a, t2.b
               ->  Seq Scan on t t2  (cost=0.00..1541.00 rows=100000 width=8)
(13 rows)
postgres=# \e
postgres=# create index ind_t_a on t(a); create index ind_t_b on t(b);
CREATE INDEX
CREATE INDEX
postgres=# explain
postgres-# select count(*)
postgres-# from t as t1,
postgres-# t as t2
postgres-# where t1.b = t2.b
postgres-# and t2.a = (select max(a)
postgres(# from t as t3
postgres(# where t3.b = t1.b )
postgres-# ;
                                   QUERY PLAN
---------------------------------------------------------------------------------------
 Aggregate  (cost=812400.03..812400.04 rows=1 width=0)
   ->  Merge Join  (cost=0.00..812400.02 rows=1 width=0)
         Merge Cond: (t1.b = t2.b)
         Join Filter: (t2.a = (subplan))
         ->  Index Scan using ind_t_b on t t1  (cost=0.00..3148.01 rows=100000 width=4)
         ->  Index Scan using ind_t_b on t t2  (cost=0.00..3148.01 rows=100000 width=8)
         SubPlan
           ->  Aggregate  (cost=8.03..8.04 rows=1 width=4)
                 ->  Index Scan using ind_t_b on t t3  (cost=0.00..8.03 rows=1 width=4)
                       Index Cond: (b = $0)
(10 rows)
postgres=# select count(*)
postgres-# from t as t1,
postgres-# t as t2
postgres-# where t1.b = t2.b
postgres-# and t2.a = (select max(a)
postgres(# from t as t3
postgres(# where t3.b = t1.b )
postgres-# ;
count
--------
100000
(1 row)
Time: 1500.000 ms
postgres=#
Hope this helps.
Best regards,
--
gurjeet[.singh]@EnterpriseDB.com
singh.gurjeet@{ gmail | hotmail | yahoo }.com