Re: pg_stat_activity crashes - Mailing list pgsql-hackers

From Kyotaro HORIGUCHI
Subject Re: pg_stat_activity crashes
Date
Msg-id 20160421.115640.176305136.horiguchi.kyotaro@lab.ntt.co.jp
Whole thread Raw
In response to pg_stat_activity crashes  (Petr Jelinek <petr@2ndquadrant.com>)
Responses Re: pg_stat_activity crashes  (Robert Haas <robertmhaas@gmail.com>)
List pgsql-hackers
Hello,

At Wed, 20 Apr 2016 15:14:16 +0200, Petr Jelinek <petr@2ndquadrant.com> wrote in <571780A8.4070902@2ndquadrant.com>
> I noticed sporadic segfaults when selecting from pg_stat_activity on
> current HEAD.
> 
> The culprit is the 53be0b1add7064ca5db3cd884302dfc3268d884e commit
> which added more wait info into the pg_stat_get_activity(). More
> specifically, the following code is broken:
> 
> +                       proc = BackendPidGetProc(beentry->st_procpid);
> + wait_event_type = pgstat_get_wait_event_type(proc->wait_event_info);
> 
> This needs to check if proc is NULL. When reading the code I noticed
> that the new functions pg_stat_get_backend_wait_event_type() and
> pg_stat_get_backend_wait_event() suffer from the same problem.

Good catch.

> Here is PoC patch which fixes the problem. I am wondering if we should
> raise warning in the pg_stat_get_backend_wait_event_type() and
> pg_stat_get_backend_wait_event() like the pg_signal_backend() does
> when proc is NULL instead of just returning NULL which is what this
> patch does though.

It still makes the two relevant columns in pg_stat_activity
inconsistent each other since it reads the procarray entry twice
without a lock on procarray.

The attached patch adds pgstat_get_wait_event_info to read
wait_event_info in more appropriate way. Then change
pg_stat_get_wait_event(_type) to take the wait_event_info.

Does this work for you?

We still may have an inconsistency between weit_event and query,
or beentry itself but preventing it would need to have local
copies of wait_event_info of all corresponding entries in
procarray, which will be overdone.

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 41f4374..999b7e7 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -55,6 +55,7 @@#include "storage/latch.h"#include "storage/lmgr.h"#include "storage/pg_shmem.h"
+#include "storage/procarray.h"#include "storage/procsignal.h"#include "storage/sinvaladt.h"#include "utils/ascii.h"
@@ -3118,6 +3119,27 @@ pgstat_read_current_status(void)}/* ----------
+ * pgstat_get_wait_event_info() -
+ *
+ *    Return the wait_event_info for a procid. 0 if the proc is no longer
+ *    living or has no waiting event.
+ */
+uint32
+pgstat_get_wait_event_info(int procpid)
+{
+    uint32        ret = 0;
+    PGPROC       *proc;
+
+    LWLockAcquire(ProcArrayLock, LW_SHARED);
+    proc = BackendPidGetProcWithLock(procpid);
+    if (proc)
+        ret = proc->wait_event_info;
+    LWLockRelease(ProcArrayLock);
+
+    return ret;
+}
+
+/* ---------- * pgstat_get_wait_event_type() - * *    Return a string representing the current wait event type,
backendis
 
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 64c4cc4..72776ab 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -679,7 +679,7 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)        bool        nulls[PG_STAT_GET_ACTIVITY_COLS];
LocalPgBackendStatus*local_beentry;        PgBackendStatus *beentry;
 
-        PGPROC       *proc;
+        uint32        wait_event_info;        const char *wait_event_type;        const char *wait_event;
@@ -716,6 +716,14 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)            continue;        }
+        /*
+         * Read wait event. This may be inconsistent with the beentry when the
+         * corresponding procarray entry has been removed or modified after
+         * the beentry was copied, but we don't need such strict consistency
+         * for this information.
+         */
+        wait_event_info = pgstat_get_wait_event_info(beentry->st_procpid);
+        /* Values available to all callers */        values[0] = ObjectIdGetDatum(beentry->st_databaseid);
values[1]= Int32GetDatum(beentry->st_procpid);
 
@@ -782,14 +790,13 @@ pg_stat_get_activity(PG_FUNCTION_ARGS)            values[5] =
CStringGetTextDatum(beentry->st_activity);
-            proc = BackendPidGetProc(beentry->st_procpid);
-            wait_event_type = pgstat_get_wait_event_type(proc->wait_event_info);
+            wait_event_type = pgstat_get_wait_event_type(wait_event_info);            if (wait_event_type)
  values[6] = CStringGetTextDatum(wait_event_type);            else                nulls[6] = true;
 
-            wait_event = pgstat_get_wait_event(proc->wait_event_info);
+            wait_event = pgstat_get_wait_event(wait_event_info);            if (wait_event)                values[7] =
CStringGetTextDatum(wait_event);           else
 
@@ -983,7 +990,6 @@ pg_stat_get_backend_wait_event_type(PG_FUNCTION_ARGS){    int32        beid = PG_GETARG_INT32(0);
PgBackendStatus *beentry;
 
-    PGPROC       *proc;    const char *wait_event_type;    if ((beentry = pgstat_fetch_stat_beentry(beid)) == NULL)
@@ -992,8 +998,8 @@ pg_stat_get_backend_wait_event_type(PG_FUNCTION_ARGS)        wait_event_type = "<insufficient
privilege>";   else    {
 
-        proc = BackendPidGetProc(beentry->st_procpid);
-        wait_event_type = pgstat_get_wait_event_type(proc->wait_event_info);
+        wait_event_type = pgstat_get_wait_event_type(
+            pgstat_get_wait_event_info(beentry->st_procpid));    }    if (!wait_event_type)
@@ -1007,7 +1013,6 @@ pg_stat_get_backend_wait_event(PG_FUNCTION_ARGS){    int32        beid = PG_GETARG_INT32(0);
PgBackendStatus*beentry;
 
-    PGPROC       *proc;    const char *wait_event;    if ((beentry = pgstat_fetch_stat_beentry(beid)) == NULL)
@@ -1016,8 +1021,8 @@ pg_stat_get_backend_wait_event(PG_FUNCTION_ARGS)        wait_event = "<insufficient privilege>";
 else    {
 
-        proc = BackendPidGetProc(beentry->st_procpid);
-        wait_event = pgstat_get_wait_event(proc->wait_event_info);
+        wait_event = pgstat_get_wait_event(
+            pgstat_get_wait_event_info(beentry->st_procpid));    }    if (!wait_event)
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 4be09fe..228e865 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -969,6 +969,7 @@ extern void pgstat_report_activity(BackendState state, const char *cmd_str);extern void
pgstat_report_tempfile(size_tfilesize);extern void pgstat_report_appname(const char *appname);extern void
pgstat_report_xact_timestamp(TimestampTztstamp);
 
+extern uint32 pgstat_get_wait_event_info(int procpid);extern const char *pgstat_get_wait_event(uint32
wait_event_info);externconst char *pgstat_get_wait_event_type(uint32 wait_event_info);extern const char
*pgstat_get_backend_current_activity(intpid, bool checkUser); 

pgsql-hackers by date:

Previous
From: Stephen Frost
Date:
Subject: Re: pg_dump dump catalog ACLs
Next
From: Robert Haas
Date:
Subject: Re: "parallel= " information is not coming in pg_dumpall for create aggregate