Re: a JOIN on same table, but 'slided over' - Mailing list pgsql-general

From PFC
Subject Re: a JOIN on same table, but 'slided over'
Date
Msg-id op.tujt3m0ucigqcu@apollo13
Whole thread Raw
In response to Re: a JOIN on same table, but 'slided over'  (Rafal Pietrak <rafal@zorro.isa-geek.com>)
List pgsql-general
    OK, check...

test=> CREATE TABLE test (id INTEGER PRIMARY KEY);
test=> INSERT INTO test SELECT random()*5 + n*10 FROM
generate_series( 1,100000 ) AS n;
test=> SELECT * FROM test LIMIT 10;
   id
-----
   11
   23
   31
   41
   52
   63
   70
   85
   94
  103

test=> ANALYZE test;
ANALYZE

    -- Self Join 1

test=> EXPLAIN ANALYZE SELECT t1.id AS current_id, t2.id AS next_id
 FROM test t1, test t2
WHERE t2.id = ( SELECT min(id) FROM test AS t3 WHERE t3.id > t1.id )
ORDER BY t1.id;
                                                                         QUERY
PLAN

-----------------------------------------------------------------------------------------------------------------------------------------------------------
  Sort  (cost=26703.19..26953.19 rows=100000 width=8) (actual
time=5240.392..5271.529 rows=99999 loops=1)
    Sort Key: t1.id
    ->  Hash Join  (cost=2691.00..18398.37 rows=100000 width=8) (actual
time=106.588..5179.737 rows=99999 loops=1)
          Hash Cond: ((subplan) = t2.id)
          ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000 width=4)
(actual time=0.013..34.782 rows=100000 loops=1)
          ->  Hash  (cost=1441.00..1441.00 rows=100000 width=4) (actual
time=106.420..106.420 rows=100000 loops=1)
                ->  Seq Scan on test t2  (cost=0.00..1441.00 rows=100000
width=4) (actual time=0.007..43.077 rows=100000 loops=1)
          SubPlan
            ->  Result  (cost=0.03..0.04 rows=1 width=0) (actual
time=0.023..0.023 rows=1 loops=199999)
                  InitPlan
                    ->  Limit  (cost=0.00..0.03 rows=1 width=4) (actual
time=0.021..0.022 rows=1 loops=199999)
                          ->  Index Scan using test_pkey on test t3
(cost=0.00..1029.59 rows=33333 width=4) (actual time=0.020..0.020 rows=1
loops=199999)
                                Index Cond: (id > $0)
                                Filter: (id IS NOT NULL)
  Total runtime: 5295.677 ms

    -- Self Join 2

test=> set enable_hashjoin TO 0;
test=> EXPLAIN ANALYZE SELECT t1.id AS current_id, t2.id AS next_id
 FROM test t1, test t2
WHERE t2.id = ( SELECT min(id) FROM test AS t3 WHERE t3.id > t1.id )
ORDER BY t1.id;
                                                                               QUERY
PLAN

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Sort  (cost=30806.48..31056.48 rows=100000 width=8) (actual
time=2876.249..2903.011 rows=99999 loops=1)
    Sort Key: t1.id
    ->  Merge Join  (cost=9745.82..22501.66 rows=100000 width=8) (actual
time=2547.830..2820.347 rows=99999 loops=1)
          Merge Cond: (t2.id = "inner"."?column2?")
          ->  Index Scan using test_pkey on test t2  (cost=0.00..2828.26
rows=100000 width=4) (actual time=0.035..67.747 rows=100000 loops=1)
          ->  Sort  (cost=9745.82..9995.82 rows=100000 width=4) (actual
time=2547.779..2582.889 rows=100000 loops=1)
                Sort Key: (subplan)
                ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000
width=4) (actual time=0.060..2487.728 rows=100000 loops=1)
                      SubPlan
                        ->  Result  (cost=0.03..0.04 rows=1 width=0)
(actual time=0.023..0.023 rows=1 loops=100000)
                              InitPlan
                                ->  Limit  (cost=0.00..0.03 rows=1 width=4)
(actual time=0.021..0.022 rows=1 loops=100000)
                                      ->  Index Scan using test_pkey on
test t3  (cost=0.00..1029.59 rows=33333 width=4) (actual time=0.020..0.020
rows=1 loops=100000)
                                            Index Cond: (id > $0)
                                            Filter: (id IS NOT NULL)
  Total runtime: 2923.804 ms

    -- DISTINCT ON

test=> EXPLAIN SELECT DISTINCT ON (t1.id) t1.id AS current_id, t2.id AS
next_id
 FROM test t1 JOIN test t2 ON t2.id > t1.id
ORDER BY t1.id, t2.id;
                                            QUERY PLAN
-------------------------------------------------------------------------------------------------
  Unique  (cost=729806679.75..746473346.41 rows=100000 width=8)
    ->  Sort  (cost=729806679.75..738140013.08 rows=3333333333 width=8)
          Sort Key: t1.id, t2.id
          ->  Nested Loop  (cost=0.00..100028973.00 rows=3333333333 width=8)
                ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000
width=4)
                ->  Index Scan using test_pkey on test t2
(cost=0.00..583.61 rows=33333 width=4)
                      Index Cond: (t2.id > t1.id)
(7 lignes)

    This one takes much longer (I interrupted it).

    -- Using a function

CREATE TYPE test_type AS ( current_id INTEGER, next_id INTEGER );

CREATE OR REPLACE FUNCTION testfunc( )
     RETURNS SETOF test_type
     LANGUAGE plpgsql
     AS
$$
DECLARE
    _row        test_type;
BEGIN
    _row.current_id = NULL;

    FOR _row.next_id IN SELECT id FROM test ORDER BY id LOOP
        IF _row.current_id IS NOT NULL THEN
            RETURN NEXT _row;
        END IF;
        _row.current_id = _row.next_id;
    END LOOP;
END;
$$;

test=> EXPLAIN ANALYZE SELECT * FROM testfunc();
                                                      QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
  Function Scan on testfunc  (cost=0.00..12.50 rows=1000 width=8) (actual
time=211.702..238.322 rows=100000 loops=1)
  Total runtime: 262.369 ms

    Same results, at least 10x faster on large datasets...








pgsql-general by date:

Previous
From: PFC
Date:
Subject: Re: Ordering in SELECT statement
Next
From: Bruce McAlister
Date:
Subject: Re: AutoVacuum Behaviour Question