Re: We need to log aborted autovacuums - Mailing list pgsql-hackers

From Greg Smith
Subject Re: We need to log aborted autovacuums
Date
Msg-id 4D33376E.2060909@2ndquadrant.com
Whole thread Raw
In response to Re: We need to log aborted autovacuums  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: We need to log aborted autovacuums  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
Tom Lane wrote:
> No, I don't believe we should be messing with the semantics of
> try_relation_open.  It is what it is.
>

With only four pretty simple callers to the thing, and two of them
needing the alternate behavior, it seemed a reasonable place to modify
to me.  I thought the "nowait" boolean idea was in enough places that it
was reasonable to attach to try_relation_open.

Attached patch solves the "wait for lock forever" problem, and
introduces a new log message when AV or auto-analyze fail to obtain a
lock on something that needs to be cleaned up:

DEBUG:  autovacuum: processing database "gsmith"
INFO:  autovacuum skipping relation 65563 --- cannot open or obtain lock
INFO:  autoanalyze skipping relation 65563 --- cannot open or obtain lock

My main concern is that this may cause AV to constantly fail to get
access to a busy table, where in the current code it would queue up and
eventually get the lock needed.  A secondary issue is that while the
autovacuum messages only show up if you have log_autovacuum_min_duration
set to not -1, the autoanalyze ones can't be stopped.

If you don't like the way I structured the code, you can certainly do it
some other way instead.  I thought this approach was really simple and
not unlike similar code elsewhere.

Here's the test case that worked for me here again:

psql
SHOW log_autovacuum_min_duration;
DROP TABLE t;
CREATE TABLE t(s serial,i integer);
INSERT INTO t(i) SELECT generate_series(1,100000);
SELECT relname,last_autovacuum,autovacuum_count FROM pg_stat_user_tables
WHERE relname='t';
DELETE FROM t WHERE s<50000;
\q
psql
BEGIN;
LOCK t;

Leave that open, then go to anther session with old "tail -f" on the
logs to wait for the errors to show up.

--
Greg Smith   2ndQuadrant US    greg@2ndQuadrant.com   Baltimore, MD
PostgreSQL Training, Services, and 24x7 Support  www.2ndQuadrant.us
"PostgreSQL 9.0 High Performance": http://www.2ndQuadrant.com/books

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 25d9fde..4193fff 100644
*** a/src/backend/access/heap/heapam.c
--- b/src/backend/access/heap/heapam.c
*************** relation_open(Oid relationId, LOCKMODE l
*** 918,936 ****
   *
   *        Same as relation_open, except return NULL instead of failing
   *        if the relation does not exist.
   * ----------------
   */
  Relation
! try_relation_open(Oid relationId, LOCKMODE lockmode)
  {
      Relation    r;
!
      Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

      /* Get the lock first */
      if (lockmode != NoLock)
!         LockRelationOid(relationId, lockmode);
!
      /*
       * Now that we have the lock, probe to see if the relation really exists
       * or not.
--- 918,951 ----
   *
   *        Same as relation_open, except return NULL instead of failing
   *        if the relation does not exist.
+  *
+  *        If called with nowait enabled, this will immediately exit
+  *        if a lock is requested and it can't be acquired.  The
+  *        return code in this case doesn't distinguish between this
+  *        situation and the one where the relation was locked, but
+  *        doesn't exist.  Callers using nowait must not care that
+  *        they won't be able to tell the difference.
   * ----------------
   */
  Relation
! try_relation_open(Oid relationId, LOCKMODE lockmode, bool nowait)
  {
      Relation    r;
!     bool        locked;
      Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);

      /* Get the lock first */
      if (lockmode != NoLock)
!         {
!         if (nowait)
!             {
!             locked=ConditionalLockRelationOid(relationId, lockmode);
!             if (!locked)
!                 return NULL;
!             }
!         else
!             LockRelationOid(relationId, lockmode);
!         }
      /*
       * Now that we have the lock, probe to see if the relation really exists
       * or not.
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 7bc5f11..24bfb16 100644
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** analyze_rel(Oid relid, VacuumStmt *vacst
*** 147,156 ****
       * concurrent VACUUM, which doesn't matter much at the moment but might
       * matter if we ever try to accumulate stats on dead tuples.) If the rel
       * has been dropped since we last saw it, we don't need to process it.
       */
!     onerel = try_relation_open(relid, ShareUpdateExclusiveLock);
      if (!onerel)
!         return;

      /*
       * Check permissions --- this should match vacuum's check!
--- 147,168 ----
       * concurrent VACUUM, which doesn't matter much at the moment but might
       * matter if we ever try to accumulate stats on dead tuples.) If the rel
       * has been dropped since we last saw it, we don't need to process it.
+      *
+      * If this is a manual analyze, opening will wait forever to acquire
+      * the requested lock on the relation.  Autovacuum will just give up
+      * immediately if it can't get the lock.  This prevents a series of locked
+      * relations from potentially hanging all of the AV workers waiting
+      * for locks.
       */
!     onerel = try_relation_open(relid, ShareUpdateExclusiveLock, IsAutoVacuumWorkerProcess());
      if (!onerel)
!         {
!             if (IsAutoVacuumWorkerProcess())
!                 ereport(INFO,
!                     (errmsg("autoanalyze skipping relation %d --- cannot open or obtain lock",
!                         relid)));
!             return;
!         }

      /*
       * Check permissions --- this should match vacuum's check!
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 19c3cf9..b5d5caa 100644
*** a/src/backend/commands/cluster.c
--- b/src/backend/commands/cluster.c
*************** cluster_rel(Oid tableOid, Oid indexOid,
*** 276,282 ****
       * case, since cluster() already did it.)  The index lock is taken inside
       * check_index_is_clusterable.
       */
!     OldHeap = try_relation_open(tableOid, AccessExclusiveLock);

      /* If the table has gone away, we can skip processing it */
      if (!OldHeap)
--- 276,282 ----
       * case, since cluster() already did it.)  The index lock is taken inside
       * check_index_is_clusterable.
       */
!     OldHeap = try_relation_open(tableOid, AccessExclusiveLock,false);

      /* If the table has gone away, we can skip processing it */
      if (!OldHeap)
diff --git a/src/backend/commands/lockcmds.c b/src/backend/commands/lockcmds.c
index b2c6ea5..488504d 100644
*** a/src/backend/commands/lockcmds.c
--- b/src/backend/commands/lockcmds.c
*************** LockTableRecurse(Oid reloid, RangeVar *r
*** 106,112 ****
       * Now that we have the lock, check to see if the relation really exists
       * or not.
       */
!     rel = try_relation_open(reloid, NoLock);

      if (!rel)
      {
--- 106,112 ----
       * Now that we have the lock, check to see if the relation really exists
       * or not.
       */
!     rel = try_relation_open(reloid, NoLock,false);

      if (!rel)
      {
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 9098c5d..cb5b0e0 100644
*** a/src/backend/commands/vacuum.c
--- b/src/backend/commands/vacuum.c
*************** vacuum_rel(Oid relid, VacuumStmt *vacstm
*** 844,856 ****
       *
       * There's a race condition here: the rel may have gone away since the
       * last time we saw it.  If so, we don't need to vacuum it.
       */
!     onerel = try_relation_open(relid, lmode);

      if (!onerel)
      {
          PopActiveSnapshot();
          CommitTransactionCommand();
          return;
      }

--- 844,868 ----
       *
       * There's a race condition here: the rel may have gone away since the
       * last time we saw it.  If so, we don't need to vacuum it.
+      *
+      * If this is a manual vacuum, opening will wait forever to acquire
+      * the requested lock on the relation.  Autovacuum will just give up
+      * immediately if it can't get the lock.  This prevents a series of locked
+      * relations from potentially hanging all of the AV workers waiting
+      * for locks.
       */
!     onerel = try_relation_open(relid, lmode, IsAutoVacuumWorkerProcess());

      if (!onerel)
      {
          PopActiveSnapshot();
          CommitTransactionCommand();
+         if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+         {
+         ereport(INFO,
+             (errmsg("autovacuum skipping relation %d --- cannot open or obtain lock",
+                 relid)));
+         }
          return;
      }

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 9efab4c..981dfe8 100644
*** a/src/include/access/heapam.h
--- b/src/include/access/heapam.h
*************** typedef enum
*** 48,54 ****

  /* in heap/heapam.c */
  extern Relation relation_open(Oid relationId, LOCKMODE lockmode);
! extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode);
  extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode);
  extern Relation try_relation_openrv(const RangeVar *relation, LOCKMODE lockmode);
  extern void relation_close(Relation relation, LOCKMODE lockmode);
--- 48,54 ----

  /* in heap/heapam.c */
  extern Relation relation_open(Oid relationId, LOCKMODE lockmode);
! extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode, bool nowait);
  extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode);
  extern Relation try_relation_openrv(const RangeVar *relation, LOCKMODE lockmode);
  extern void relation_close(Relation relation, LOCKMODE lockmode);

pgsql-hackers by date:

Previous
From: Magnus Hagander
Date:
Subject: Re: pg_basebackup for streaming base backups
Next
From: Tom Lane
Date:
Subject: Re: We need to log aborted autovacuums