Thread: autovacuum hung?

autovacuum hung?

From
Brian Cox
Date:
autovacuum has been running on 2 tables for > 5 hours. These tables are
not huge (see below). For the past ~1 hour, I've shut off all other
activity on this database. The other table being vacuumed has more rows
(1897810). Anyone have any ideas about why this is taking so long?

Thanks,
Brian


[root@rdl64xeoserv01 log]# fgrep autov /var/lib/pgsql/data/postgresql.conf
autovacuum = on                 # enable autovacuum subprocess?
autovacuum_naptime = 60s                # time between autovacuum runs,
in secs
autovacuum_vacuum_threshold = 200       # min # of tuple updates before
autovacuum_analyze_threshold = 50       # min # of tuple updates before
autovacuum_vacuum_scale_factor = 0.2    # fraction of rel size before
autovacuum_analyze_scale_factor = 0.1   # fraction of rel size before
#autovacuum_vacuum_cost_delay = -1      # default vacuum cost delay for
                                         # autovac, -1 means use
#autovacuum_vacuum_cost_limit = -1      # default vacuum cost limit for



Welcome to psql 8.3.5, the PostgreSQL interactive terminal.

Type:  \copyright for distribution terms
        \h for help with SQL commands
        \? for help with psql commands
        \g or terminate with semicolon to execute query
        \q to quit

cemdb=# select procpid,query_start,current_query from pg_stat_activity;
  procpid |          query_start          |
current_query
---------+-------------------------------+-----------------------------------------------------------------
    24866 | 2009-05-29 13:50:11.251397-07 | autovacuum: VACUUM
public.ts_user_sessions_map
    24869 | 2009-05-29 11:46:54.221713-07 | autovacuum: VACUUM ANALYZE
public.ts_stats_transet_user_daily
    24872 | 2009-05-29 11:31:28.324954-07 | autovacuum: VACUUM ANALYZE
public.ts_stats_transet_user_weekly
    28097 | 2009-05-29 15:58:49.24832-07  | select
procpid,query_start,current_query from pg_stat_activity;
(4 rows)

cemdb=# select count(*) from ts_stats_transet_user_daily;
  count
--------
  558321
(1 row)

cemdb=# select count(*) from ts_stats_transet_user_weekly;
  count
--------
  333324
(1 row)

cemdb=# select c.oid,c.relname,l.pid,l.mode,l.granted from pg_class c
join pg_locks l on c.oid=l.relation order by l.pid;
    oid    |                        relname                        |
pid  |           mode           | granted
----------+-------------------------------------------------------+-------+--------------------------+---------
  26612062 | ts_user_sessions_map                                  |
24866 | ShareUpdateExclusiveLock | t
  26613644 | ts_user_sessions_map_interimsessionidindex            |
24866 | RowExclusiveLock         | t
  26613645 | ts_user_sessions_map_sessionidindex                   |
24866 | RowExclusiveLock         | t
  26612846 | ts_user_sessions_map_appindex                         |
24866 | RowExclusiveLock         | t
  26612417 | ts_user_sessions_map_pkey                             |
24866 | RowExclusiveLock         | t
  27208308 | ts_stats_transet_user_daily_userindex                 |
24869 | RowExclusiveLock         | t
  27208305 | ts_stats_transet_user_daily_transetincarnationidindex |
24869 | RowExclusiveLock         | t
  27208310 | ts_stats_transet_user_daily_yearindex                 |
24869 | RowExclusiveLock         | t
  27208307 | ts_stats_transet_user_daily_userincarnationidindex    |
24869 | RowExclusiveLock         | t
  27208302 | ts_stats_transet_user_daily_lastaggregatedrowindex    |
24869 | RowExclusiveLock         | t
  27208309 | ts_stats_transet_user_daily_weekindex                 |
24869 | RowExclusiveLock         | t
  26612320 | ts_stats_transet_user_daily_pkey                      |
24869 | RowExclusiveLock         | t
  27208306 | ts_stats_transet_user_daily_transetindex              |
24869 | RowExclusiveLock         | t
  26611722 | ts_stats_transet_user_daily                           |
24869 | ShareUpdateExclusiveLock | t
  27208303 | ts_stats_transet_user_daily_monthindex                |
24869 | RowExclusiveLock         | t
  27208304 | ts_stats_transet_user_daily_starttimeindex            |
24869 | RowExclusiveLock         | t
  27208300 | ts_stats_transet_user_daily_dayindex                  |
24869 | RowExclusiveLock         | t
  27208301 | ts_stats_transet_user_daily_hourindex                 |
24869 | RowExclusiveLock         | t
  26612551 | ts_stats_transet_user_weekly_lastaggregatedrowindex   |
24872 | RowExclusiveLock         | t
  26612558 | ts_stats_transet_user_weekly_yearindex                |
24872 | RowExclusiveLock         | t
  26612326 | ts_stats_transet_user_weekly_pkey                     |
24872 | RowExclusiveLock         | t
  26612554 | ts_stats_transet_user_weekly_transetindex             |
24872 | RowExclusiveLock         | t
  26612555 | ts_stats_transet_user_weekly_userincarnationidindex   |
24872 | RowExclusiveLock         | t
  26611743 | ts_stats_transet_user_weekly                          |
24872 | ShareUpdateExclusiveLock | t
  26612556 | ts_stats_transet_user_weekly_userindex                |
24872 | RowExclusiveLock         | t
  26612553 | ts_stats_transet_user_weekly_starttimeindex           |
24872 | RowExclusiveLock         | t
  26612557 | ts_stats_transet_user_weekly_weekindex                |
24872 | RowExclusiveLock         | t
  26612550 | ts_stats_transet_user_weekly_hourindex                |
24872 | RowExclusiveLock         | t
  26612552 | ts_stats_transet_user_weekly_monthindex               |
24872 | RowExclusiveLock         | t
  26612549 | ts_stats_transet_user_weekly_dayindex                 |
24872 | RowExclusiveLock         | t
      2663 | pg_class_relname_nsp_index                            |
28097 | AccessShareLock          | t
     10969 | pg_locks                                              |
28097 | AccessShareLock          | t
      1259 | pg_class                                              |
28097 | AccessShareLock          | t
      2662 | pg_class_oid_index                                    |
28097 | AccessShareLock          | t
(34 rows)


Re: autovacuum hung?

From
Alvaro Herrera
Date:
Brian Cox wrote:
> autovacuum has been running on 2 tables for > 5 hours. These tables are
> not huge (see below). For the past ~1 hour, I've shut off all other
> activity on this database. The other table being vacuumed has more rows
> (1897810). Anyone have any ideas about why this is taking so long?

What's vacuum_cost_delay?

--
Alvaro Herrera                                http://www.CommandPrompt.com/
The PostgreSQL Company - Command Prompt, Inc.

Re: autovacuum hung?

From
Brian Cox
Date:
Alvaro Herrera [alvherre@commandprompt.com] wrote:
> What's vacuum_cost_delay?
#vacuum_cost_delay = 0                  # 0-1000 milliseconds
#vacuum_cost_page_hit = 1               # 0-10000 credits
#vacuum_cost_page_miss = 10             # 0-10000 credits
#vacuum_cost_page_dirty = 20            # 0-10000 credits
#vacuum_cost_limit = 200                # 0-10000 credits

so, whatever the default happens to be.

Thanks,
Brian

Re: autovacuum hung?

From
Tom Lane
Date:
Brian Cox <brian.cox@ca.com> writes:
> autovacuum has been running on 2 tables for > 5 hours. These tables are
> not huge (see below). For the past ~1 hour, I've shut off all other
> activity on this database. The other table being vacuumed has more rows
> (1897810). Anyone have any ideas about why this is taking so long?

Are those processes actually doing anything, or just waiting?  strace
or local equivalent would be the most conclusive check.

> cemdb=# select c.oid,c.relname,l.pid,l.mode,l.granted from pg_class c
> join pg_locks l on c.oid=l.relation order by l.pid;

This query isn't very helpful because it fails to show locks that are
not directly associated with tables.

            regards, tom lane

Re: autovacuum hung?

From
Brian Cox
Date:
Tom Lane [tgl@sss.pgh.pa.us] wrote:
>
> Are those processes actually doing anything, or just waiting?  strace
> or local equivalent would be the most conclusive check.
These must not have been hung, because they finally completed (after
10-15 hrs - some time between 11pm and 8am). Question is why does it
take so long to do this on such a relatively small table?

> This query isn't very helpful because it fails to show locks that are
> not directly associated with tables.
How can that (locks not directly associated...) be determined?


Thanks,
Brian

Re: autovacuum hung?

From
Tom Lane
Date:
Brian Cox <brian.cox@ca.com> writes:
> Tom Lane [tgl@sss.pgh.pa.us] wrote:
>> Are those processes actually doing anything, or just waiting?  strace
>> or local equivalent would be the most conclusive check.

> These must not have been hung, because they finally completed (after
> 10-15 hrs - some time between 11pm and 8am). Question is why does it
> take so long to do this on such a relatively small table?

They might have been blocked behind some other process that was sitting
in an open transaction for some reason.  The other likely cause is badly
chosen autovacuum delay, but I think that was already covered.

>> This query isn't very helpful because it fails to show locks that are
>> not directly associated with tables.

> How can that (locks not directly associated...) be determined?

Don't assume every row in pg_locks has a join partner in pg_class.
You could use an outer join ...

            regards, tom lane

Re: autovacuum hung?

From
Brian Cox
Date:
Tom Lane [tgl@sss.pgh.pa.us] wrote:
> They might have been blocked behind some other process that was sitting
> in an open transaction for some reason.  The other likely cause is badly
> chosen autovacuum delay, but I think that was already covered.
Well, after I noticed this running for a while, I shutdown the postgres
port and restarted postgres. The autovacuum of these tables kicked in
promptly when postgres was back up. I then let them run. So, I don't
think that surmise #1 is likely.
As for #2, I'm using the default. These tables get updated once a day
with each row (potentially) being updated 1-24 times over many minutes
to a handful of hours. Do you think it would be better to manually
vacuum these tables? If so, would it be best to disable autovacuum of
them? And while I'm at it, if you disable autovacuum of the master table
will that disable it for the actual partitions?

  > Don't assume every row in pg_locks has a join partner in pg_class.
> You could use an outer join ...
Yes, of course. It never occurred to me that there could be db locks not
associated with tables.

Thanks,
Brian


Re: autovacuum hung?

From
Tom Lane
Date:
Brian Cox <brian.cox@ca.com> writes:
> Do you think it would be better to manually
> vacuum these tables? If so, would it be best to disable autovacuum of
> them? And while I'm at it, if you disable autovacuum of the master table
> will that disable it for the actual partitions?

No, no, and no.  What would be best is to find out what actually
happened.  The evidence is gone now, but if you see it again please
take a closer look.

            regards, tom lane

Re: autovacuum hung?

From
Brian Cox
Date:
Tom Lane [tgl@sss.pgh.pa.us] wrote:
> No, no, and no.  What would be best is to find out what actually
> happened.  The evidence is gone now, but if you see it again please
> take a closer look.
OK. You mentioned strace. It's got a lot of options; any in particular
that would be useful if this happens again?

Brian


Re: autovacuum hung?

From
Tom Lane
Date:
Brian Cox <brian.cox@ca.com> writes:
> OK. You mentioned strace. It's got a lot of options; any in particular
> that would be useful if this happens again?

I'd just do "strace -p processID" and watch it for a little while.
If it's not hung, you'll see the process issuing kernel calls at
some rate or other.

If it is hung, you'll most likely see something like

    semop(...)

and it just sits there.  Also, if you see nothing but a series of
select()s with varying timeouts, that would suggest a stuck spinlock
(although I doubt that was happening, as it would eventually timeout
and report a failure).

            regards, tom lane

Re: autovacuum hung?

From
Tom Lane
Date:
Brian Cox <brian.cox@ca.com> writes:
> OK. You mentioned strace. It's got a lot of options; any in particular
> that would be useful if this happens again?

Oh, and don't forget the more-complete pg_locks state.  We'll want all
the columns of pg_locks, not just the ones you showed before.

            regards, tom lane

Re: autovacuum hung?

From
Brian Cox
Date:
Tom Lane [tgl@sss.pgh.pa.us] wrote:
> Oh, and don't forget the more-complete pg_locks state.  We'll want all
> the columns of pg_locks, not just the ones you showed before.
auto vacuum of ts_user_sessions_map has been running for > 17 hours.
This table has 2,204,488 rows. I hope that I've captured enough info.

Thanks,
Brian


cemdb=# select procpid,current_query,query_start from pg_stat_activity;
  procpid |                          current_query
     |          query_start
---------+-----------------------------------------------------------------+-------------------------------
     8817 | <IDLE>
     | 2009-07-09 16:48:12.656419-07
     8818 | autovacuum: VACUUM public.ts_user_sessions_map
     | 2009-07-09 16:48:18.873677-07


cemdb=# select
l.pid,c.relname,l.mode,l.granted,l.virtualxid,l.virtualtransaction from
pg_locks l left outer join pg_class c on c.oid=l.relation order by l.pid;
   pid  |                  relname                   |           mode
         | granted | virtualxid | virtualtransaction

-------+--------------------------------------------+--------------------------+---------+------------+--------------------

  8818 | ts_user_sessions_map_interimsessionidindex | RowExclusiveLock
        | t       |            | 2/3
   8818 | ts_user_sessions_map_appindex              | RowExclusiveLock
         | t       |            | 2/3
   8818 | ts_user_sessions_map_sessionidindex        | RowExclusiveLock
         | t       |            | 2/3
   8818 | ts_user_sessions_map                       |
ShareUpdateExclusiveLock | t       |            | 2/3
   8818 |                                            | ExclusiveLock
         | t       | 2/3        | 2/3
   8818 | ts_user_sessions_map_pkey                  | RowExclusiveLock
         | t       |            | 2/3
  13706 |                                            | ExclusiveLock
         | t       | 6/50       | 6/50
  13706 | pg_class_oid_index                         | AccessShareLock
         | t       |            | 6/50
  13706 | pg_class_relname_nsp_index                 | AccessShareLock
         | t       |            | 6/50
  13706 | pg_locks                                   | AccessShareLock
         | t       |            | 6/50
  13706 | pg_class                                   | AccessShareLock
         | t       |            | 6/50
(11 rows)


[root@rdl64xeoserv01 log]# strace -p 8818 -o /tmp/strace.log
Process 8818 attached - interrupt to quit
Process 8818 detached
[root@rdl64xeoserv01 log]# more /tmp/strace.log
select(0, NULL, NULL, NULL, {0, 13000}) = 0 (Timeout)
read(36, "`\0\0\0\370\354\250u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0\340\f\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\300,\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0(L\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0\264"...,
8192) = 8192
read(36, "`\0\0\0|M\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0\264"...,
8192) = 8192
read(36, "`\0\0\0\\~\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0\264"...,
8192) = 8192
read(36, "`\0\0\0D\234\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\34\255\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\4\315\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\234\2334x\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\354\354\251u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0\324\f\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\274,\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\244L\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0008^\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0,\233\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\370\330\252u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0000\371\252u\1\0\0\0\34\0\270\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\364\30\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\2448\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
select(0, NULL, NULL, NULL, {0, 20000}) = 0 (Timeout)
read(36, "`\0\0\0dX\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0\264"...,
8192) = 8192
read(36, "`\0\0\0X\216\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\10\256\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\300\315\253u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0\304\f\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\354=\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\254]\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0d}\254u\1\0\0\0\34\0\270\37\360\37\4 \0\0\0\0\270"...,
8192) = 8192
read(36, "`\0\0\0\24\235\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\314\274\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0\314\330\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0"...,
8192) = 8192
read(36, "`\0\0\0X\354\254u\1\0\0\0\34\0\264\37\360\37\4 \0\0\0\0"...,
8192) = 8192
read(36, "`\0\0\0\350\253\30x\1\0\0\0\34\0\264\37\360\37\4 \0\0\0"...,
8192) = 8192
_llseek(36, 1068474368, [1068474368], SEEK_SET) = 0
read(36, "`\0\0\0\350\253\30x\1\0\0\0\24\1h\21\360\37\4 \0\0\0\0"...,
8192) = 8192
_llseek(36, 1068220416, [1068220416], SEEK_SET) = 0
read(36, "`\0\0\0P\356\254u\1\0\0\0\34\0\270\37\360\37\4 \0\0\0\0"...,
8192) = 8192