reducing statistics write overhead - Mailing list pgsql-hackers

From Martin Pihlak
Subject reducing statistics write overhead
Date
Msg-id 48C181D4.8030807@gmail.com
Whole thread Raw
Responses Re: reducing statistics write overhead  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
Howdy,

The statistics collector currently dumps the stats file at every 500ms.  This
is a major problem if the file becomes large -- occasionally we've been forced
to disable stats collection to cope with it.  Another issue is that while the
file is frequently written, it is seldom read. Typically once a minute -
autovacuum plus possible user initiated stats queries.

So, as a simple optimization I am proposing that the file should be only
written when some backend requests statistics. This would significantly reduce
the undesired write traffic at the cost of slightly slower stats access.

Attached is a WIP patch, which basically implements this:

Disable periodic writing of the stats file. Introduce new stats message type -
PGSTAT_MTYPE_INQUIRY. Backends send this to notify collector that stats is needed.
Pid of the requestor is provided in the message. Backend then installs an alarm
handler and starts a timer.  Collector processes the messages and compiles a list
of pids to be notified. If there are any, the stats file is written and SIGALRM
is sent to the requestors. Backend then proceeds to read the stats file a usual.

Thoughts, comments?

regards,
Martin
Index: src/backend/postmaster/pgstat.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/pgstat.c,v
retrieving revision 1.181
diff -c -r1.181 pgstat.c
*** src/backend/postmaster/pgstat.c    25 Aug 2008 18:55:43 -0000    1.181
--- src/backend/postmaster/pgstat.c    5 Sep 2008 18:36:24 -0000
***************
*** 85,90 ****
--- 85,92 ----
  #define PGSTAT_SELECT_TIMEOUT    2        /* How often to check for postmaster
                                           * death; in seconds. */

+ #define PGSTAT_WRITE_TIMEOUT    5        /* How long to wait for the collector
+                                          * to finish writing the file; in seconds. */

  /* ----------
   * The initial size hints for the hash tables used in the collector.
***************
*** 94,100 ****
  #define PGSTAT_TAB_HASH_SIZE    512
  #define PGSTAT_FUNCTION_HASH_SIZE    512

-
  /* ----------
   * GUC parameters
   * ----------
--- 96,101 ----
***************
*** 201,208 ****
--- 202,213 ----
   */
  static PgStat_GlobalStats globalStats;

+ /* Last time we wrote the stats file */
+ static TimestampTz last_statwrite;
+
  static volatile bool need_exit = false;
  static volatile bool need_statwrite = false;
+ static volatile bool stats_ready = false;
  static volatile bool got_SIGHUP = false;

  /*
***************
*** 213,218 ****
--- 218,229 ----
  static instr_time total_func_time;


+ #define PGSTAT_MAX_INQUIRIES    16        /* Max number of outstanding stats inquiries */
+
+ /* List of backends that need notifications */
+ static pid_t inquiring_backends[PGSTAT_MAX_INQUIRIES];
+ static int num_inquiries = 0;
+
  /* ----------
   * Local function forward declarations
   * ----------
***************
*** 223,229 ****

  NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
  static void pgstat_exit(SIGNAL_ARGS);
! static void force_statwrite(SIGNAL_ARGS);
  static void pgstat_beshutdown_hook(int code, Datum arg);
  static void pgstat_sighup_handler(SIGNAL_ARGS);

--- 234,240 ----

  NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
  static void pgstat_exit(SIGNAL_ARGS);
! static void pgstat_alarm_handler(SIGNAL_ARGS);
  static void pgstat_beshutdown_hook(int code, Datum arg);
  static void pgstat_sighup_handler(SIGNAL_ARGS);

***************
*** 254,259 ****
--- 265,271 ----
  static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
  static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
  static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
+ static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);


  /* ------------------------------------------------------------
***************
*** 2463,2468 ****
--- 2475,2496 ----
      hdr->m_type = mtype;
  }

+ /* ----------
+  * pgstat_send_inquiry() -
+  *
+  *        Notify collector that we need fresh data.
+  * ----------
+  */
+ static void
+ pgstat_send_inquiry(void)
+ {
+     PgStat_MsgInquiry msg;
+
+     elog(DEBUG1, "pgstat_send_inquiry: I am %u", getpid());
+     pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
+     msg.m_pid = getpid();
+     pgstat_send(&msg, sizeof(msg));
+ }

  /* ----------
   * pgstat_send() -
***************
*** 2538,2545 ****
  NON_EXEC_STATIC void
  PgstatCollectorMain(int argc, char *argv[])
  {
-     struct itimerval write_timeout;
-     bool        need_timer = false;
      int            len;
      PgStat_Msg    msg;

--- 2566,2571 ----
***************
*** 2577,2583 ****
      pqsignal(SIGINT, SIG_IGN);
      pqsignal(SIGTERM, SIG_IGN);
      pqsignal(SIGQUIT, pgstat_exit);
-     pqsignal(SIGALRM, force_statwrite);
      pqsignal(SIGPIPE, SIG_IGN);
      pqsignal(SIGUSR1, SIG_IGN);
      pqsignal(SIGUSR2, SIG_IGN);
--- 2603,2608 ----
***************
*** 2598,2608 ****
       */
      need_statwrite = true;

-     /* Preset the delay between status file writes */
-     MemSet(&write_timeout, 0, sizeof(struct itimerval));
-     write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
-     write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
-
      /*
       * Read in an existing statistics stats file or initialize the stats to
       * zero.
--- 2623,2628 ----
***************
*** 2628,2634 ****
       */
      for (;;)
      {
!         int            got_data;

          /*
           * Quit if we get SIGQUIT from the postmaster.
--- 2648,2654 ----
       */
      for (;;)
      {
!         int            i, got_data;

          /*
           * Quit if we get SIGQUIT from the postmaster.
***************
*** 2649,2670 ****
          }

          /*
!          * If time to write the stats file, do so.    Note that the alarm
!          * interrupt isn't re-enabled immediately, but only after we next
!          * receive a stats message; so no cycles are wasted when there is
!          * nothing going on.
           */
          if (need_statwrite)
          {
              /* Check for postmaster death; if so we'll write file below */
              if (!PostmasterIsAlive(true))
                  break;

!             pgstat_write_statsfile(false);
              need_statwrite = false;
-             need_timer = true;
          }

          /*
           * Wait for a message to arrive; but not for more than
           * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
--- 2669,2708 ----
          }

          /*
!          * If time to write the stats file, do so.
           */
          if (need_statwrite)
          {
+             TimestampTz now = GetCurrentTimestamp();
+             long sec;
+             int us;
+
              /* Check for postmaster death; if so we'll write file below */
              if (!PostmasterIsAlive(true))
                  break;

!             /* Write the file, but only if enough time has passed since last write */
!             if (TimestampDifferenceExceeds(last_statwrite, now, PGSTAT_STAT_INTERVAL))
!             {
!                 pgstat_write_statsfile(false);
!                 last_statwrite = now;
!             }
!
!             TimestampDifference(now, GetCurrentTimestamp(), &sec, &us);
!             elog(DEBUG1, "pgstat_write_statsfile: written in %ldms",
!                 (long)(sec*1000 + us/1000.0));
!
              need_statwrite = false;
          }

+         /* Notify inquiring backends that we are done with the file */
+         for (i = 0; i < num_inquiries; i++)
+         {
+             elog(DEBUG1, "signalling backend %u", (unsigned)inquiring_backends[i]);
+             kill(inquiring_backends[i], SIGALRM);
+         }
+         num_inquiries = 0;
+
          /*
           * Wait for a message to arrive; but not for more than
           * PGSTAT_SELECT_TIMEOUT seconds. (This determines how quickly we will
***************
*** 2797,2817 ****
                      pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
                      break;

!                 default:
                      break;
-             }

!             /*
!              * If this is the first message after we wrote the stats file the
!              * last time, enable the alarm interrupt to make it be written
!              * again later.
!              */
!             if (need_timer)
!             {
!                 if (setitimer(ITIMER_REAL, &write_timeout, NULL))
!                     ereport(ERROR,
!                     (errmsg("could not set statistics collector timer: %m")));
!                 need_timer = false;
              }
          }
          else
--- 2835,2846 ----
                      pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
                      break;

!                 case PGSTAT_MTYPE_INQUIRY:
!                     pgstat_recv_inquiry((PgStat_MsgInquiry *)&msg, len);
                      break;

!                 default:
!                     break;
              }
          }
          else
***************
*** 2841,2853 ****
      need_exit = true;
  }

- /* SIGALRM signal handler for collector process */
- static void
- force_statwrite(SIGNAL_ARGS)
- {
-     need_statwrite = true;
- }
-
  /* SIGHUP handler for collector process */
  static void
  pgstat_sighup_handler(SIGNAL_ARGS)
--- 2870,2875 ----
***************
*** 2855,2860 ****
--- 2877,2888 ----
      got_SIGHUP = true;
  }

+ /* SIGALRM signal handler for backend process */
+ static void
+ pgstat_alarm_handler(SIGNAL_ARGS)
+ {
+     stats_ready = true;
+ }

  /*
   * Lookup the hash table entry for the specified database. If no hash
***************
*** 3297,3307 ****
--- 3325,3368 ----
  static void
  backend_read_statsfile(void)
  {
+     TimestampTz start;
+     pqsigfunc oldalarm;
+     static struct itimerval stat_timeout;
+     int count = 0;
+     long sec;
+     int usec;
+
      /* already read it? */
      if (pgStatDBHash)
          return;
      Assert(!pgStatRunningInCollector);

+     start = GetCurrentTimestamp();
+
+     /* These need only be done once */
+     pqsignal(SIGALRM, pgstat_alarm_handler);
+     stat_timeout.it_value.tv_sec = PGSTAT_WRITE_TIMEOUT;
+
+     stats_ready = false;
+     if (setitimer(ITIMER_REAL, &stat_timeout, NULL))
+        ereport(ERROR, (errmsg("could not set statistics collector poll timer: %m")));
+
+     while (!stats_ready)
+     {
+         CHECK_FOR_INTERRUPTS();
+
+         pgstat_send_inquiry();
+         pg_usleep(5000);
+         count += 1;
+     }
+
+     /* Stats ready or timeout - either way we have to proceed */
+
+     TimestampDifference(start, GetCurrentTimestamp(), &sec, &usec);
+     elog(DEBUG1, "backend_read_statsfile: %u: waited for %lu ms, looped %d times.",
+         getpid(),
+         (unsigned long)(sec*1000 + usec/1000.0), count);
+
      /* Autovacuum launcher wants stats about all databases */
      if (IsAutoVacuumLauncherProcess())
          pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
***************
*** 3779,3781 ****
--- 3840,3870 ----
                             HASH_REMOVE, NULL);
      }
  }
+
+ /* ----------
+  * pgstat_recv_inquiry() -
+  *
+  *    Process stat inquiry requests.
+  * ----------
+  */
+ static void
+ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
+ {
+     int i;
+
+     /* filter out duplicates */
+     for (i = 0; i < num_inquiries; i++)
+         if (inquiring_backends[i] == msg->m_pid)
+             return;
+
+     if (num_inquiries >= PGSTAT_MAX_INQUIRIES)
+     {
+         /* List is full - just drop the message, it'll be resent */
+         elog(DEBUG1, "pgstat_recv_inquiry: too many outstanding inquiries - %d",
+             num_inquiries);
+         return;
+     }
+     inquiring_backends[num_inquiries++] = msg->m_pid;
+     need_statwrite = true;
+ }
+
Index: src/include/pgstat.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/pgstat.h,v
retrieving revision 1.78
diff -c -r1.78 pgstat.h
*** src/include/pgstat.h    15 Aug 2008 08:37:40 -0000    1.78
--- src/include/pgstat.h    5 Sep 2008 18:36:27 -0000
***************
*** 42,48 ****
      PGSTAT_MTYPE_ANALYZE,
      PGSTAT_MTYPE_BGWRITER,
      PGSTAT_MTYPE_FUNCSTAT,
!     PGSTAT_MTYPE_FUNCPURGE
  } StatMsgType;

  /* ----------
--- 42,49 ----
      PGSTAT_MTYPE_ANALYZE,
      PGSTAT_MTYPE_BGWRITER,
      PGSTAT_MTYPE_FUNCSTAT,
!     PGSTAT_MTYPE_FUNCPURGE,
!     PGSTAT_MTYPE_INQUIRY
  } StatMsgType;

  /* ----------
***************
*** 385,390 ****
--- 386,404 ----


  /* ----------
+  * PgStat_MsgInquiry            Sent by the backend to tell the collector
+  *                                to write the stats file.
+  * ----------
+  */
+
+ typedef struct PgStat_MsgInquiry
+ {
+     PgStat_MsgHdr m_hdr;
+     int            m_pid;
+ } PgStat_MsgInquiry;
+
+
+ /* ----------
   * PgStat_Msg                    Union over all possible messages.
   * ----------
   */
***************
*** 402,407 ****
--- 416,422 ----
      PgStat_MsgBgWriter msg_bgwriter;
      PgStat_MsgFuncstat msg_funcstat;
      PgStat_MsgFuncpurge msg_funcpurge;
+     PgStat_MsgInquiry msg_inquiry;
  } PgStat_Msg;



pgsql-hackers by date:

Previous
From: "Merlin Moncure"
Date:
Subject: Re: CVS head has broken make
Next
From: "Merlin Moncure"
Date:
Subject: Re: libpq events update