Re: reducing statistics write overhead - Mailing list pgsql-hackers

From Martin Pihlak
Subject Re: reducing statistics write overhead
Date
Msg-id 48C594D8.6050504@gmail.com
Whole thread Raw
In response to Re: reducing statistics write overhead  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: reducing statistics write overhead  (Alvaro Herrera <alvherre@commandprompt.com>)
Re: reducing statistics write overhead  (Tom Lane <tgl@sss.pgh.pa.us>)
List pgsql-hackers
Tom Lane wrote:
> Hmm.  With the timestamp in the file, ISTM that we could put all the
> intelligence on the reader side.  Reader checks file, sends message if
... snip ...
> remember the file timestamp it last wrote out.  There are various ways
> you could design it but what comes to mind here is for readers to send
> a timestamp defined as minimum acceptable file timestamp (ie, their
> current clock time less whatever slop they want to allow).  Then,
> when the collector gets a request with that timestamp <= its last
> write timestamp, it knows it need not do anything.

Attached is a patch that implements the described signalling. Additionally
following non-related changes have been made:
1. fopen/fclose replaced with AllocateFile/FreeFile
2. pgstat_report_stat() now also checks if there are functions to report
before returning. Previous behavior was to return immediately if no table
stats were available for reporting.

> responding to a new live write request.  It's sort of annoying that
> the readers have to sleep for an arbitrary interval though.  If we could
> get rid of that part...
>
It is, but I guess its unavoidable if we have to wait for the file to be
written. I'll try to do some load testing tomorrow, to see if something
nasty comes out.

regards,
Martin

Index: src/backend/postmaster/pgstat.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/postmaster/pgstat.c,v
retrieving revision 1.181
diff -c -r1.181 pgstat.c
*** src/backend/postmaster/pgstat.c    25 Aug 2008 18:55:43 -0000    1.181
--- src/backend/postmaster/pgstat.c    8 Sep 2008 21:00:17 -0000
***************
*** 85,90 ****
--- 85,97 ----
  #define PGSTAT_SELECT_TIMEOUT    2        /* How often to check for postmaster
                                           * death; in seconds. */

+ #define PGSTAT_POLL_RETRY_DELAY    5        /* How long to pause between statistics
+                                          * update requests; in milliseconds. */
+
+ #define PGSTAT_POLL_TIME        5000    /* How long are we allowed to poll for
+                                          * the stats file; in milliseconds. */
+
+ #define PGSTAT_POLL_LOOP_COUNT    (PGSTAT_POLL_TIME / PGSTAT_POLL_RETRY_DELAY)

  /* ----------
   * The initial size hints for the hash tables used in the collector.
***************
*** 159,164 ****
--- 166,177 ----
  static HTAB *pgStatFunctions = NULL;

  /*
+  * Indicates if backend has some function stats which it hasn't yet
+  * sent to the collector.
+  */
+ static bool have_function_stats = false;
+
+ /*
   * Tuple insertion/deletion counts for an open transaction can't be propagated
   * into PgStat_TableStatus counters until we know if it is going to commit
   * or abort.  Hence, we keep these counts in per-subxact structs that live
***************
*** 201,208 ****
   */
  static PgStat_GlobalStats globalStats;

  static volatile bool need_exit = false;
- static volatile bool need_statwrite = false;
  static volatile bool got_SIGHUP = false;

  /*
--- 214,225 ----
   */
  static PgStat_GlobalStats globalStats;

+ /* Last time the collector wrote the stats file */
+ static TimestampTz last_statwrite;
+ /* Latest statistics request from backend */
+ static TimestampTz last_statrequest;
+
  static volatile bool need_exit = false;
  static volatile bool got_SIGHUP = false;

  /*
***************
*** 223,229 ****

  NON_EXEC_STATIC void PgstatCollectorMain(int argc, char *argv[]);
  static void pgstat_exit(SIGNAL_ARGS);
- static void force_statwrite(SIGNAL_ARGS);
  static void pgstat_beshutdown_hook(int code, Datum arg);
  static void pgstat_sighup_handler(SIGNAL_ARGS);

--- 240,245 ----
***************
*** 254,259 ****
--- 270,276 ----
  static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
  static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
  static void pgstat_recv_funcpurge(PgStat_MsgFuncpurge *msg, int len);
+ static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len);


  /* ------------------------------------------------------------
***************
*** 655,663 ****
      int            i;

      /* Don't expend a clock check if nothing to do */
!     /* Note: we ignore pending function stats in this test ... OK? */
!     if (pgStatTabList == NULL ||
!         pgStatTabList->tsa_used == 0)
          return;

      /*
--- 672,679 ----
      int            i;

      /* Don't expend a clock check if nothing to do */
!     if ((pgStatTabList == NULL || pgStatTabList->tsa_used == 0)
!         && !have_function_stats)
          return;

      /*
***************
*** 823,828 ****
--- 839,846 ----
      if (msg.m_nentries > 0)
          pgstat_send(&msg, offsetof(PgStat_MsgFuncstat, m_entry[0]) +
                      msg.m_nentries * sizeof(PgStat_FunctionEntry));
+
+     have_function_stats = false;
  }


***************
*** 1341,1346 ****
--- 1359,1367 ----
          fs->f_numcalls++;
      fs->f_time = f_total;
      INSTR_TIME_ADD(fs->f_time_self, f_self);
+
+     /* indicate that we have something to upload */
+     have_function_stats = true;
  }


***************
*** 2463,2468 ****
--- 2484,2505 ----
      hdr->m_type = mtype;
  }

+ /* ----------
+  * pgstat_send_inquiry() -
+  *
+  *        Notify collector that we need fresh data.
+  *        ts is used to specify the minimum valid timestamp for the file.
+  * ----------
+  */
+ static void
+ pgstat_send_inquiry(TimestampTz ts)
+ {
+     PgStat_MsgInquiry msg;
+
+     pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY);
+     msg.inquiry_time = ts;
+     pgstat_send(&msg, sizeof(msg));
+ }

  /* ----------
   * pgstat_send() -
***************
*** 2538,2545 ****
  NON_EXEC_STATIC void
  PgstatCollectorMain(int argc, char *argv[])
  {
-     struct itimerval write_timeout;
-     bool        need_timer = false;
      int            len;
      PgStat_Msg    msg;

--- 2575,2580 ----
***************
*** 2571,2583 ****

      /*
       * Ignore all signals usually bound to some action in the postmaster,
!      * except SIGQUIT and SIGALRM.
       */
      pqsignal(SIGHUP, pgstat_sighup_handler);
      pqsignal(SIGINT, SIG_IGN);
      pqsignal(SIGTERM, SIG_IGN);
      pqsignal(SIGQUIT, pgstat_exit);
-     pqsignal(SIGALRM, force_statwrite);
      pqsignal(SIGPIPE, SIG_IGN);
      pqsignal(SIGUSR1, SIG_IGN);
      pqsignal(SIGUSR2, SIG_IGN);
--- 2606,2617 ----

      /*
       * Ignore all signals usually bound to some action in the postmaster,
!      * except SIGQUIT
       */
      pqsignal(SIGHUP, pgstat_sighup_handler);
      pqsignal(SIGINT, SIG_IGN);
      pqsignal(SIGTERM, SIG_IGN);
      pqsignal(SIGQUIT, pgstat_exit);
      pqsignal(SIGPIPE, SIG_IGN);
      pqsignal(SIGUSR1, SIG_IGN);
      pqsignal(SIGUSR2, SIG_IGN);
***************
*** 2594,2609 ****
      init_ps_display("stats collector process", "", "", "");

      /*
-      * Arrange to write the initial status file right away
-      */
-     need_statwrite = true;
-
-     /* Preset the delay between status file writes */
-     MemSet(&write_timeout, 0, sizeof(struct itimerval));
-     write_timeout.it_value.tv_sec = PGSTAT_STAT_INTERVAL / 1000;
-     write_timeout.it_value.tv_usec = (PGSTAT_STAT_INTERVAL % 1000) * 1000;
-
-     /*
       * Read in an existing statistics stats file or initialize the stats to
       * zero.
       */
--- 2628,2633 ----
***************
*** 2645,2668 ****
          {
              ProcessConfigFile(PGC_SIGHUP);
              got_SIGHUP = false;
!             need_statwrite = true;
          }

          /*
!          * If time to write the stats file, do so.    Note that the alarm
!          * interrupt isn't re-enabled immediately, but only after we next
!          * receive a stats message; so no cycles are wasted when there is
!          * nothing going on.
           */
!         if (need_statwrite)
          {
              /* Check for postmaster death; if so we'll write file below */
              if (!PostmasterIsAlive(true))
                  break;
-
              pgstat_write_statsfile(false);
-             need_statwrite = false;
-             need_timer = true;
          }

          /*
--- 2669,2687 ----
          {
              ProcessConfigFile(PGC_SIGHUP);
              got_SIGHUP = false;
!             last_statrequest = last_statwrite;
          }

          /*
!          * Write the stats file if so requested and not satisfied by
!          * existing file.
           */
!         if (TimestampDifferenceExceeds(last_statwrite, last_statrequest, 0))
          {
              /* Check for postmaster death; if so we'll write file below */
              if (!PostmasterIsAlive(true))
                  break;
              pgstat_write_statsfile(false);
          }

          /*
***************
*** 2797,2817 ****
                      pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
                      break;

!                 default:
                      break;
-             }

!             /*
!              * If this is the first message after we wrote the stats file the
!              * last time, enable the alarm interrupt to make it be written
!              * again later.
!              */
!             if (need_timer)
!             {
!                 if (setitimer(ITIMER_REAL, &write_timeout, NULL))
!                     ereport(ERROR,
!                     (errmsg("could not set statistics collector timer: %m")));
!                 need_timer = false;
              }
          }
          else
--- 2816,2827 ----
                      pgstat_recv_funcpurge((PgStat_MsgFuncpurge *) &msg, len);
                      break;

!                 case PGSTAT_MTYPE_INQUIRY:
!                     pgstat_recv_inquiry((PgStat_MsgInquiry *)&msg, len);
                      break;

!                 default:
!                     break;
              }
          }
          else
***************
*** 2841,2853 ****
      need_exit = true;
  }

- /* SIGALRM signal handler for collector process */
- static void
- force_statwrite(SIGNAL_ARGS)
- {
-     need_statwrite = true;
- }
-
  /* SIGHUP handler for collector process */
  static void
  pgstat_sighup_handler(SIGNAL_ARGS)
--- 2851,2856 ----
***************
*** 2943,2949 ****
      /*
       * Open the statistics temp file to write out the current values.
       */
!     fpout = fopen(tmpfile, PG_BINARY_W);
      if (fpout == NULL)
      {
          ereport(LOG,
--- 2946,2952 ----
      /*
       * Open the statistics temp file to write out the current values.
       */
!     fpout = AllocateFile(tmpfile, PG_BINARY_W);
      if (fpout == NULL)
      {
          ereport(LOG,
***************
*** 2954,2963 ****
      }

      /*
!      * Write the file header --- currently just a format ID.
       */
      format_id = PGSTAT_FILE_FORMAT_ID;
      fwrite(&format_id, sizeof(format_id), 1, fpout);

      /*
       * Write global stats struct
--- 2957,2968 ----
      }

      /*
!      * Write the file header --- currently just a format ID and a timestamp.
       */
      format_id = PGSTAT_FILE_FORMAT_ID;
      fwrite(&format_id, sizeof(format_id), 1, fpout);
+     last_statwrite = GetCurrentTimestamp();
+     fwrite(&last_statwrite, sizeof(last_statwrite), 1, fpout);

      /*
       * Write global stats struct
***************
*** 3017,3026 ****
                  (errcode_for_file_access(),
                 errmsg("could not write temporary statistics file \"%s\": %m",
                        tmpfile)));
!         fclose(fpout);
          unlink(tmpfile);
      }
!     else if (fclose(fpout) < 0)
      {
          ereport(LOG,
                  (errcode_for_file_access(),
--- 3022,3031 ----
                  (errcode_for_file_access(),
                 errmsg("could not write temporary statistics file \"%s\": %m",
                        tmpfile)));
!         FreeFile(fpout);
          unlink(tmpfile);
      }
!     else if (FreeFile(fpout) < 0)
      {
          ereport(LOG,
                  (errcode_for_file_access(),
***************
*** 3108,3113 ****
--- 3113,3126 ----
          goto done;
      }

+     /* Skip over the timestamp */
+     if (fseek(fpin, sizeof(TimestampTz), SEEK_CUR) != 0)
+     {
+         ereport(pgStatRunningInCollector ? LOG : WARNING,
+                 (errmsg("corrupted pgstat.stat file")));
+         goto done;
+     }
+
      /*
       * Read global stats struct
       */
***************
*** 3297,3307 ****
--- 3310,3363 ----
  static void
  backend_read_statsfile(void)
  {
+     TimestampTz file_min_ts;
+     bool stats_ready = false;
+     int count;
+
      /* already read it? */
      if (pgStatDBHash)
          return;
      Assert(!pgStatRunningInCollector);

+     /*
+      * This is the earliest timestamp we can use. Usually we can tolerate
+      * up to PGSTAT_STAT_INTERVAL ms stale files.
+      */
+     file_min_ts = TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
+         -PGSTAT_STAT_INTERVAL);
+
+     /*
+      * Loop until fresh enough stats file is available or we ran out of time.
+      * The stats inquiry message is resent in case collector drops our first
+      * message.
+      */
+     for (count = 0; !stats_ready && count < PGSTAT_POLL_LOOP_COUNT; count++)
+     {
+         TimestampTz ts;
+         FILE *fp;
+
+         CHECK_FOR_INTERRUPTS();
+
+         /* Read the timestamp from beginning of the stats file */
+         if ((fp = AllocateFile(pgstat_stat_filename, PG_BINARY_R)) != NULL)
+         {
+             fseek(fp, sizeof(int32), SEEK_SET);
+             fread(&ts, sizeof(ts), 1, fp);
+             if (!ferror(fp) && TimestampDifferenceExceeds(file_min_ts, ts, 0))
+                 stats_ready = true;
+             FreeFile(fp);
+         }
+
+         if (!stats_ready)
+         {
+             pgstat_send_inquiry(file_min_ts);
+             pg_usleep(PGSTAT_POLL_RETRY_DELAY * 1000);
+         }
+     }
+
+     if (!stats_ready)
+         elog(WARNING, "pgstat wait timeout");
+
      /* Autovacuum launcher wants stats about all databases */
      if (IsAutoVacuumLauncherProcess())
          pgStatDBHash = pgstat_read_statsfile(InvalidOid, false);
***************
*** 3779,3781 ****
--- 3835,3851 ----
                             HASH_REMOVE, NULL);
      }
  }
+
+ /* ----------
+  * pgstat_recv_inquiry() -
+  *
+  *    Process stat inquiry requests.
+  * ----------
+  */
+ static void
+ pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len)
+ {
+     if (msg->inquiry_time > last_statrequest)
+         last_statrequest = msg->inquiry_time;
+ }
+
Index: src/include/pgstat.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/pgstat.h,v
retrieving revision 1.78
diff -c -r1.78 pgstat.h
*** src/include/pgstat.h    15 Aug 2008 08:37:40 -0000    1.78
--- src/include/pgstat.h    8 Sep 2008 21:00:20 -0000
***************
*** 42,48 ****
      PGSTAT_MTYPE_ANALYZE,
      PGSTAT_MTYPE_BGWRITER,
      PGSTAT_MTYPE_FUNCSTAT,
!     PGSTAT_MTYPE_FUNCPURGE
  } StatMsgType;

  /* ----------
--- 42,49 ----
      PGSTAT_MTYPE_ANALYZE,
      PGSTAT_MTYPE_BGWRITER,
      PGSTAT_MTYPE_FUNCSTAT,
!     PGSTAT_MTYPE_FUNCPURGE,
!     PGSTAT_MTYPE_INQUIRY
  } StatMsgType;

  /* ----------
***************
*** 385,390 ****
--- 386,404 ----


  /* ----------
+  * PgStat_MsgInquiry            Sent by the backend to tell the collector
+  *                                to write the stats file.
+  * ----------
+  */
+
+ typedef struct PgStat_MsgInquiry
+ {
+     PgStat_MsgHdr    m_hdr;
+     TimestampTz        inquiry_time;
+ } PgStat_MsgInquiry;
+
+
+ /* ----------
   * PgStat_Msg                    Union over all possible messages.
   * ----------
   */
***************
*** 402,407 ****
--- 416,422 ----
      PgStat_MsgBgWriter msg_bgwriter;
      PgStat_MsgFuncstat msg_funcstat;
      PgStat_MsgFuncpurge msg_funcpurge;
+     PgStat_MsgInquiry msg_inquiry;
  } PgStat_Msg;


***************
*** 413,419 ****
   * ------------------------------------------------------------
   */

! #define PGSTAT_FILE_FORMAT_ID    0x01A5BC97

  /* ----------
   * PgStat_StatDBEntry            The collector's data per database
--- 428,434 ----
   * ------------------------------------------------------------
   */

! #define PGSTAT_FILE_FORMAT_ID    0x01A5BC98

  /* ----------
   * PgStat_StatDBEntry            The collector's data per database

pgsql-hackers by date:

Previous
From: Jeff Davis
Date:
Subject: Re: Common Table Expressions (WITH RECURSIVE) patch
Next
From: Bruce Momjian
Date:
Subject: Re: Synchronous Log Shipping Replication