Thread: select top N entries from several groups

select top N entries from several groups

From
David Orme
Date:
Hi,

Suppose I have a table (called temp) like this:

gp    val
A    10
A    8
A    6
A    4
B    3
B    2
B    1
B    0

How can I get the largest two values for each group in a single pass? I
want to end up with:

gp    val
A    10
A    8
B    3
B    2

I can do this a group at a time using...

SELECT gp, val FROM temp where gp = 'A' ORDER BY val DESC LIMIT 2;

... and then insert the results from each group into a final table. Can
I get it in one go?

Thanks,
David


Re: select top N entries from several groups

From
Sean Davis
Date:
On Apr 12, 2005, at 7:21 AM, David Orme wrote:

> Hi,
>
> Suppose I have a table (called temp) like this:
>
> gp    val
> A    10
> A    8
> A    6
> A    4
> B    3
> B    2
> B    1
> B    0
>
> How can I get the largest two values for each group in a single pass?
> I want to end up with:
>
> gp    val
> A    10
> A    8
> B    3
> B    2
>
> I can do this a group at a time using...

How about:

create table temp (
    gp char,
    val int
);

insert into temp values ('A',10);
insert into temp values ('A',8);
insert into temp values ('A',6);
insert into temp values ('A',4);
insert into temp values ('B',3);
insert into temp values ('B',2);
insert into temp values ('B',1);

select a.gp,a.val
from   temp a
where  a.val in (
                select b.val
                from   temp b
                where  a.gp=b.gp
                order by b.val desc
                limit 2);

  gp | val
----+-----
  A  |  10
  A  |   8
  B  |   3
  B  |   2
(4 rows)

I have found this link is useful for beginning to think about
subqueries:

http://www.postgresql.org/files/documentation/books/aw_pgsql/node81.html

Sean


Re: select top N entries from several groups

From
"Reinhard Hnat"
Date:
Would you like:

INSERT INTO finaltable SELECT gp, val FROM temp where gp = 'A' ORDER BY val
DESC LIMIT 2;

best regards
R.Hnat



Re: SUMMARY - select top N entries from several groups

From
David Orme
Date:
Hi,

I asked a question about pulling out the top N entries from each set of
rows identified by a grouping column, without having to run a query for
each group separately

Sean Davis initially responded with a structure using a subquery to
check whether a particular row was in the top N for the group. This is
fine but ran slowly on my table (54720 rows). Sean suggested an
alternate plpgsql function to pull out the same set of cells and Greg
Sabino Mullane suggested a way of combining individual group queries
within a single statement.  I've appended a commented set of the actual
queries.

Subquery - elegant code but very slow (4427465.67 ms)
PLPGSQL - ran very quickly (904.88 ms) - slight glitch somewhere that
means that the column names of the returned table have been shifted
which is a bit puzzling. I did have to hack Sean's code about a little
to fit it to my precise real problem so it could be that!
Combined Queries - Ran fastest (622.44 ms) but requires the unique
groups to be known.

Thanks to all for their input.

Cheers,
David


On 12 Apr 2005, at 19:40, Sean Davis wrote:

>
> On Apr 12, 2005, at 8:21 AM, Sean Davis wrote:
>
>>
>> On Apr 12, 2005, at 7:21 AM, David Orme wrote:
>>
>>> Hi,
>>>
>>> Suppose I have a table (called temp) like this:
>>>
>>> gp    val
>>> A    10
>>> A    8
>>> A    6
>>> A    4
>>> B    3
>>> B    2
>>> B    1
>>> B    0
>>>
>>> How can I get the largest two values for each group in a single
>>> pass? I want to end up with:
>>>
>>> gp    val
>>> A    10
>>> A    8
>>> B    3
>>> B    2
>>>
>>> I can do this a group at a time using...
>>
>> How about:
>>
>> create table temp (
>>     gp char,
>>     val int
>> );
>>
>> insert into temp values ('A',10);
>> insert into temp values ('A',8);
>> insert into temp values ('A',6);
>> insert into temp values ('A',4);
>> insert into temp values ('B',3);
>> insert into temp values ('B',2);
>> insert into temp values ('B',1);
>>
>> select a.gp,a.val
>> from   temp a
>> where  a.val in (
>>                 select b.val
>>                 from   temp b
>>                 where  a.gp=b.gp
>>                 order by b.val desc
>>                 limit 2);
>>
>>  gp | val
>> ----+-----
>>  A  |  10
>>  A  |   8
>>  B  |   3
>>  B  |   2
>> (4 rows)
>>
>> I have found this link is useful for beginning to think about
>> subqueries:
>>
>> http://www.postgresql.org/files/documentation/books/aw_pgsql/
>> node81.html
>>
>> Sean
>>
>
>
> Does the code below operate faster for you?  I would be curious to
> hear how the two approaches compare on your larger table.  If it does,
> could you post back a quick summary of what works, what doesn't, etc.
> to the NOVICE list so that we can all learn?
>
> Sean
>
> CREATE OR REPLACE FUNCTION get_top_2() RETURNS setof temp1 AS $$
> DECLARE
>     gps RECORD;
>     query varchar;
>     ret temp%ROWTYPE;
> BEGIN
>     FOR gps IN SELECT DISTINCT(gp) FROM temp1 LOOP
>         query := 'SELECT gp,val from temp1 where gp = ' ||
> quote_literal(gps.gp) ||
>             ' ORDER BY val DESC LIMIT 2';
>         FOR ret IN EXECUTE query LOOP
>             return next ret;
>         END LOOP;
>     END LOOP;
>     return;
> END;
> $$ LANGUAGE plpgsql;
>
> select * from get_top_2();
>
>  gp | val
> ----+-----
>  A  |  10
>  A  |  10
>  B  |   3
>  B  |   3
> (4 rows)
>
>


ACTUAL QUERIES RUN:


-- original toy example using subquery...

create table temp (
     gp char,
     val int
);

insert into temp values ('A',10);
insert into temp values ('A',8);
insert into temp values ('A',6);
insert into temp values ('A',4);
insert into temp values ('B',3);
insert into temp values ('B',2);
insert into temp values ('B',1);

select a.gp,a.val
from   temp a
where  a.val in (
                 select b.val
                 from   temp b
                 where  a.gp=b.gp
                 order by b.val desc
                 limit 2);

-- REAL USE
-- table behr_grid containing columns realm_id, grid_id, total_richness
-- find the grid_id of the 20 rows in each realm that
-- have the highest total richness value

\timing
create index behr_grid_gid_idx on behr_grid(grid_id);

--SEAN DAVIS - procedural language function

CREATE OR REPLACE FUNCTION get_top_20() RETURNS setof behr_grid AS '
DECLARE
     gps RECORD;
     query varchar;
     ret behr_grid%ROWTYPE;
BEGIN
     FOR gps IN SELECT DISTINCT(realm_id) FROM behr_grid LOOP
         query := ''SELECT realm_id, grid_id from behr_grid where
realm_id = ''
             || quote_literal(gps.realm_id) ||
             '' ORDER BY total_richness DESC LIMIT 20'';
         FOR ret IN EXECUTE query LOOP
             return next ret;
         END LOOP;
     END LOOP;
     return;
END;
' LANGUAGE plpgsql;

select grid_id, row from get_top_20();

-- Time: 904.88 ms

-- SEAN DAVIS - subquery as list

select a.realm_id, a.grid_id
from   behr_grid a
where  a.grid_id in (
                 select b.grid_id
                 from   behr_grid b
                 where  a.realm_id=b.realm_id
                 order by b.total_richness desc
                 limit 20);

-- Time: 4427465.67 ms


-- GREG SABINO MULLANE

(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='1' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='2' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='3' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='4' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='5' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='6' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='7' ORDER BY
total_richness DESC LIMIT 20)
UNION ALL
(SELECT realm_id, grid_id FROM behr_grid WHERE realm_id='8' ORDER BY
total_richness DESC LIMIT 20)
ORDER BY 1,2 DESC;

-- Time: 622.44 ms