Re: We really ought to do something about O_DIRECT and data=journalled on ext4 - Mailing list pgsql-hackers

From Bruce Momjian
Subject Re: We really ought to do something about O_DIRECT and data=journalled on ext4
Date
Msg-id 201103110125.p2B1Prd19420@momjian.us
Whole thread Raw
In response to Re: We really ought to do something about O_DIRECT and data=journalled on ext4  (Josh Berkus <josh@agliodbs.com>)
List pgsql-hackers
Josh Berkus wrote:
> On 12/6/10 6:10 PM, Tom Lane wrote:
> > Robert Haas <robertmhaas@gmail.com> writes:
> >> On Mon, Dec 6, 2010 at 9:04 PM, Josh Berkus <josh@agliodbs.com> wrote:
> >>> Actually, on OSX 10.5.8, o_dsync and fdatasync aren't even available.
> >>> From my run, it looks like even so regular fsync might be better than
> >>> open_sync.
> >
> >> But I think you need to use fsync_writethrough if you actually want durability.
> >
> > Yeah.  Unless your laptop contains an SSD, those numbers are garbage on
> > their face.  So that's another problem with test_fsync: it omits
> > fsync_writethrough.
>
> Yeah, the issue with test_fsync appears to be that it's designed to work
> without os-specific switches no matter what, not to accurately reflect
> how we access wal.

I have now modified pg_test_fsync to use O_DIRECT for O_SYNC/O_FSYNC and
O_DSYNC, if supported, so it now matches how we write WAL (except that we
don't use O_DIRECT in 'archive' and 'hot standby' modes).  The applied
patch is attached.

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + It's impossible for everything to be true. +
diff --git a/contrib/pg_test_fsync/pg_test_fsync.c b/contrib/pg_test_fsync/pg_test_fsync.c
new file mode 100644
index d075483..49a7b3c
*** a/contrib/pg_test_fsync/pg_test_fsync.c
--- b/contrib/pg_test_fsync/pg_test_fsync.c
***************
*** 23,29 ****
  #define XLOG_BLCKSZ_K    (XLOG_BLCKSZ / 1024)

  #define LABEL_FORMAT        "        %-32s"
! #define NA_FORMAT            LABEL_FORMAT "%18s"
  #define OPS_FORMAT            "%9.3f ops/sec"

  static const char *progname;
--- 23,29 ----
  #define XLOG_BLCKSZ_K    (XLOG_BLCKSZ / 1024)

  #define LABEL_FORMAT        "        %-32s"
! #define NA_FORMAT            "%18s"
  #define OPS_FORMAT            "%9.3f ops/sec"

  static const char *progname;
*************** handle_args(int argc, char *argv[])
*** 134,139 ****
--- 134,144 ----
      }

      printf("%d operations per test\n", ops_per_test);
+ #if PG_O_DIRECT != 0
+     printf("O_DIRECT supported on this platform for open_datasync and open_sync.\n");
+ #else
+     printf("Direct I/O is not supported on this platform.\n");
+ #endif
  }

  static void
*************** test_sync(int writes_per_op)
*** 184,226 ****
      /*
       * Test open_datasync if available
       */
! #ifdef OPEN_DATASYNC_FLAG
!     printf(LABEL_FORMAT, "open_datasync"
! #if PG_O_DIRECT != 0
!         " (non-direct I/O)*"
! #endif
!         );
      fflush(stdout);

!     if ((tmpfile = open(filename, O_RDWR | O_DSYNC, 0)) == -1)
!         die("could not open output file");
!     gettimeofday(&start_t, NULL);
!     for (ops = 0; ops < ops_per_test; ops++)
!     {
!         for (writes = 0; writes < writes_per_op; writes++)
!             if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
!                 die("write failed");
!         if (lseek(tmpfile, 0, SEEK_SET) == -1)
!             die("seek failed");
!     }
!     gettimeofday(&stop_t, NULL);
!     close(tmpfile);
!     print_elapse(start_t, stop_t);
!
!     /*
!      * If O_DIRECT is enabled, test that with open_datasync
!      */
! #if PG_O_DIRECT != 0
      if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
      {
!         printf(NA_FORMAT, "o_direct", "n/a**\n");
          fs_warning = true;
      }
      else
      {
!         printf(LABEL_FORMAT, "open_datasync (direct I/O)");
!         fflush(stdout);
!
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
--- 189,207 ----
      /*
       * Test open_datasync if available
       */
!     printf(LABEL_FORMAT, "open_datasync");
      fflush(stdout);

! #ifdef OPEN_DATASYNC_FLAG
      if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
      {
!         printf(NA_FORMAT, "n/a*\n");
          fs_warning = true;
      }
      else
      {
!         if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1)
!             die("could not open output file");
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
*************** test_sync(int writes_per_op)
*** 234,252 ****
          close(tmpfile);
          print_elapse(start_t, stop_t);
      }
- #endif
-
  #else
!     printf(NA_FORMAT, "open_datasync", "n/a\n");
  #endif

  /*
   * Test fdatasync if available
   */
- #ifdef HAVE_FDATASYNC
      printf(LABEL_FORMAT, "fdatasync");
      fflush(stdout);

      if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
          die("could not open output file");
      gettimeofday(&start_t, NULL);
--- 215,231 ----
          close(tmpfile);
          print_elapse(start_t, stop_t);
      }
  #else
!     printf(NA_FORMAT, "n/a\n");
  #endif

  /*
   * Test fdatasync if available
   */
      printf(LABEL_FORMAT, "fdatasync");
      fflush(stdout);

+ #ifdef HAVE_FDATASYNC
      if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
          die("could not open output file");
      gettimeofday(&start_t, NULL);
*************** test_sync(int writes_per_op)
*** 263,269 ****
      close(tmpfile);
      print_elapse(start_t, stop_t);
  #else
!     printf(NA_FORMAT, "fdatasync", "n/a\n");
  #endif

  /*
--- 242,248 ----
      close(tmpfile);
      print_elapse(start_t, stop_t);
  #else
!     printf(NA_FORMAT, "n/a\n");
  #endif

  /*
*************** test_sync(int writes_per_op)
*** 292,301 ****
  /*
   * If fsync_writethrough is available, test as well
   */
- #ifdef HAVE_FSYNC_WRITETHROUGH
      printf(LABEL_FORMAT, "fsync_writethrough");
      fflush(stdout);

      if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
          die("could not open output file");
      gettimeofday(&start_t, NULL);
--- 271,280 ----
  /*
   * If fsync_writethrough is available, test as well
   */
      printf(LABEL_FORMAT, "fsync_writethrough");
      fflush(stdout);

+ #ifdef HAVE_FSYNC_WRITETHROUGH
      if ((tmpfile = open(filename, O_RDWR, 0)) == -1)
          die("could not open output file");
      gettimeofday(&start_t, NULL);
*************** test_sync(int writes_per_op)
*** 313,361 ****
      close(tmpfile);
      print_elapse(start_t, stop_t);
  #else
!     printf(NA_FORMAT, "fsync_writethrough", "n/a\n");
  #endif

  /*
   * Test open_sync if available
   */
! #ifdef OPEN_SYNC_FLAG
!     printf(LABEL_FORMAT, "open_sync"
! #if PG_O_DIRECT != 0
!         " (non-direct I/O)*"
! #endif
!         );
      fflush(stdout);

!     if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG, 0)) == -1)
!         die("could not open output file");
!     gettimeofday(&start_t, NULL);
!     for (ops = 0; ops < ops_per_test; ops++)
!     {
!         for (writes = 0; writes < writes_per_op; writes++)
!             if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
!                 die("write failed");
!         if (lseek(tmpfile, 0, SEEK_SET) == -1)
!             die("seek failed");
!     }
!     gettimeofday(&stop_t, NULL);
!     close(tmpfile);
!     print_elapse(start_t, stop_t);
!
!     /*
!      * If O_DIRECT is enabled, test that with open_sync
!      */
! #if PG_O_DIRECT != 0
      if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
      {
!         printf(NA_FORMAT, "o_direct", "n/a**\n");
          fs_warning = true;
      }
      else
      {
-         printf(LABEL_FORMAT, "open_sync (direct I/O)");
-         fflush(stdout);
-
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
--- 292,314 ----
      close(tmpfile);
      print_elapse(start_t, stop_t);
  #else
!     printf(NA_FORMAT, "n/a\n");
  #endif

  /*
   * Test open_sync if available
   */
!     printf(LABEL_FORMAT, "open_sync");
      fflush(stdout);

! #ifdef OPEN_SYNC_FLAG
      if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
      {
!         printf(NA_FORMAT, "n/a*\n");
          fs_warning = true;
      }
      else
      {
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
*************** test_sync(int writes_per_op)
*** 369,388 ****
          close(tmpfile);
          print_elapse(start_t, stop_t);
      }
- #endif
-
  #else
!     printf(NA_FORMAT, "open_sync", "n/a\n");
! #endif
!
! #if defined(OPEN_DATASYNC_FLAG) || defined(OPEN_SYNC_FLAG)
!     if (PG_O_DIRECT != 0)
!         printf("* This non-direct I/O mode is not used by Postgres.\n");
  #endif

      if (fs_warning)
      {
!         printf("** This file system and its mount options do not support direct\n");
          printf("I/O, e.g. ext4 in journaled mode.\n");
      }
  }
--- 322,334 ----
          close(tmpfile);
          print_elapse(start_t, stop_t);
      }
  #else
!     printf(NA_FORMAT, "n/a\n");
  #endif

      if (fs_warning)
      {
!         printf("* This file system and its mount options do not support direct\n");
          printf("I/O, e.g. ext4 in journaled mode.\n");
      }
  }
*************** test_open_syncs(void)
*** 407,422 ****
  static void
  test_open_sync(const char *msg, int writes_size)
  {
- #ifdef OPEN_SYNC_FLAG
      int        tmpfile, ops, writes;

      if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
!         printf(NA_FORMAT, "o_direct", "n/a**\n");
      else
      {
-         printf(LABEL_FORMAT, msg);
-         fflush(stdout);
-
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
--- 353,368 ----
  static void
  test_open_sync(const char *msg, int writes_size)
  {
      int        tmpfile, ops, writes;

+     printf(LABEL_FORMAT, msg);
+     fflush(stdout);
+
+ #ifdef OPEN_SYNC_FLAG
      if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT, 0)) == -1)
!         printf(NA_FORMAT, "n/a*\n");
      else
      {
          gettimeofday(&start_t, NULL);
          for (ops = 0; ops < ops_per_test; ops++)
          {
*************** test_open_sync(const char *msg, int writ
*** 433,439 ****
      }

  #else
!     printf(NA_FORMAT, "open_sync", "n/a\n");
  #endif
  }

--- 379,385 ----
      }

  #else
!     printf(NA_FORMAT, "n/a\n");
  #endif
  }


pgsql-hackers by date:

Previous
From: Pavel Stehule
Date:
Subject: Re: patch: fix performance problems with repeated decompression of varlena values in plpgsql
Next
From: Bruce Momjian
Date:
Subject: Re: Default mode for shutdown