Re: [PERFORM] Direct I/O issues - Mailing list pgsql-patches

From Bruce Momjian
Subject Re: [PERFORM] Direct I/O issues
Date
Msg-id 200611241858.kAOIwjh23094@momjian.us
Whole thread Raw
Responses Re: [PERFORM] Direct I/O issues
List pgsql-patches
Greg Smith wrote:
> On Thu, 23 Nov 2006, Tom Lane wrote:
>
> > * It does not check for errors (if it had, you might have realized the
> >  other problem).
>
> All the test_fsync code needs to check for errors better; there have been
> multiple occasions where I've run that with quesiontable input and it
> didn't complain, it just happily ran and reported times that were almost
> 0.
>
> Thanks for the note about alignment, I had seen something about that in
> the xlog.c but wasn't sure if that was important in this case.
>
> It's very important to the project I'm working on that I get this cleared
> up, and I think I'm in a good position to fix it myself now.  I just
> wanted to report the issue and get some initial feedback on what's wrong.
> I'll try to rewrite that code with an eye toward the "Determine optimal
> fdatasync/fsync, O_SYNC/O_DSYNC options" to-do item, which is what I'd
> really like to have.

I have developed a patch that moves the defines into a include file
where they can be used by the backend and test_fsync.c.  I have also set
up things so there is proper alignment for O_DIRECT, and added error
checking.

Not sure if people want this for 8.2.  I think we can modify
test_fsync.c anytime but the movement of the defines into an include
file is a backend code change.

--
  Bruce Momjian   bruce@momjian.us
  EnterpriseDB    http://www.enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.257
diff -c -c -r1.257 xlog.c
*** src/backend/access/transam/xlog.c    21 Nov 2006 20:59:52 -0000    1.257
--- src/backend/access/transam/xlog.c    24 Nov 2006 18:57:39 -0000
***************
*** 49,127 ****
  #include "utils/pg_locale.h"


- /*
-  *    Because O_DIRECT bypasses the kernel buffers, and because we never
-  *    read those buffers except during crash recovery, it is a win to use
-  *    it in all cases where we sync on each write().    We could allow O_DIRECT
-  *    with fsync(), but because skipping the kernel buffer forces writes out
-  *    quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
-  *    how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
-  *    Also, O_DIRECT is never enough to force data to the drives, it merely
-  *    tries to bypass the kernel cache, so we still need O_SYNC or fsync().
-  */
- #ifdef O_DIRECT
- #define PG_O_DIRECT                O_DIRECT
- #else
- #define PG_O_DIRECT                0
- #endif
-
- /*
-  * This chunk of hackery attempts to determine which file sync methods
-  * are available on the current platform, and to choose an appropriate
-  * default method.    We assume that fsync() is always available, and that
-  * configure determined whether fdatasync() is.
-  */
- #if defined(O_SYNC)
- #define BARE_OPEN_SYNC_FLAG        O_SYNC
- #elif defined(O_FSYNC)
- #define BARE_OPEN_SYNC_FLAG        O_FSYNC
- #endif
- #ifdef BARE_OPEN_SYNC_FLAG
- #define OPEN_SYNC_FLAG            (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
- #endif
-
- #if defined(O_DSYNC)
- #if defined(OPEN_SYNC_FLAG)
- /* O_DSYNC is distinct? */
- #if O_DSYNC != BARE_OPEN_SYNC_FLAG
- #define OPEN_DATASYNC_FLAG        (O_DSYNC | PG_O_DIRECT)
- #endif
- #else                            /* !defined(OPEN_SYNC_FLAG) */
- /* Win32 only has O_DSYNC */
- #define OPEN_DATASYNC_FLAG        (O_DSYNC | PG_O_DIRECT)
- #endif
- #endif
-
- #if defined(OPEN_DATASYNC_FLAG)
- #define DEFAULT_SYNC_METHOD_STR "open_datasync"
- #define DEFAULT_SYNC_METHOD        SYNC_METHOD_OPEN
- #define DEFAULT_SYNC_FLAGBIT    OPEN_DATASYNC_FLAG
- #elif defined(HAVE_FDATASYNC)
- #define DEFAULT_SYNC_METHOD_STR "fdatasync"
- #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FDATASYNC
- #define DEFAULT_SYNC_FLAGBIT    0
- #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
- #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
- #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FSYNC_WRITETHROUGH
- #define DEFAULT_SYNC_FLAGBIT    0
- #else
- #define DEFAULT_SYNC_METHOD_STR "fsync"
- #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FSYNC
- #define DEFAULT_SYNC_FLAGBIT    0
- #endif
-
-
- /*
-  * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
-  * but XLOG_BLCKSZ is assumed to be enough for it.
-  */
- #ifdef O_DIRECT
- #define ALIGNOF_XLOG_BUFFER        XLOG_BLCKSZ
- #else
- #define ALIGNOF_XLOG_BUFFER        ALIGNOF_BUFFER
- #endif
-
-
  /* File path names (all relative to $PGDATA) */
  #define BACKUP_LABEL_FILE        "backup_label"
  #define BACKUP_LABEL_OLD        "backup_label.old"
--- 49,54 ----
Index: src/include/access/xlog.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.75
diff -c -c -r1.75 xlog.h
*** src/include/access/xlog.h    5 Nov 2006 22:42:10 -0000    1.75
--- src/include/access/xlog.h    24 Nov 2006 18:57:42 -0000
***************
*** 90,95 ****
--- 90,167 ----
  extern int    sync_method;

  /*
+  *    Because O_DIRECT bypasses the kernel buffers, and because we never
+  *    read those buffers except during crash recovery, it is a win to use
+  *    it in all cases where we sync on each write().    We could allow O_DIRECT
+  *    with fsync(), but because skipping the kernel buffer forces writes out
+  *    quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
+  *    how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
+  *    Also, O_DIRECT is never enough to force data to the drives, it merely
+  *    tries to bypass the kernel cache, so we still need O_SYNC or fsync().
+  */
+ #ifdef O_DIRECT
+ #define PG_O_DIRECT                O_DIRECT
+ #else
+ #define PG_O_DIRECT                0
+ #endif
+
+ /*
+  * This chunk of hackery attempts to determine which file sync methods
+  * are available on the current platform, and to choose an appropriate
+  * default method.    We assume that fsync() is always available, and that
+  * configure determined whether fdatasync() is.
+  */
+ #if defined(O_SYNC)
+ #define BARE_OPEN_SYNC_FLAG        O_SYNC
+ #elif defined(O_FSYNC)
+ #define BARE_OPEN_SYNC_FLAG        O_FSYNC
+ #endif
+ #ifdef BARE_OPEN_SYNC_FLAG
+ #define OPEN_SYNC_FLAG            (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+ #endif
+
+ #if defined(O_DSYNC)
+ #if defined(OPEN_SYNC_FLAG)
+ /* O_DSYNC is distinct? */
+ #if O_DSYNC != BARE_OPEN_SYNC_FLAG
+ #define OPEN_DATASYNC_FLAG        (O_DSYNC | PG_O_DIRECT)
+ #endif
+ #else                            /* !defined(OPEN_SYNC_FLAG) */
+ /* Win32 only has O_DSYNC */
+ #define OPEN_DATASYNC_FLAG        (O_DSYNC | PG_O_DIRECT)
+ #endif
+ #endif
+
+ #if defined(OPEN_DATASYNC_FLAG)
+ #define DEFAULT_SYNC_METHOD_STR "open_datasync"
+ #define DEFAULT_SYNC_METHOD        SYNC_METHOD_OPEN
+ #define DEFAULT_SYNC_FLAGBIT    OPEN_DATASYNC_FLAG
+ #elif defined(HAVE_FDATASYNC)
+ #define DEFAULT_SYNC_METHOD_STR "fdatasync"
+ #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FDATASYNC
+ #define DEFAULT_SYNC_FLAGBIT    0
+ #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
+ #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
+ #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FSYNC_WRITETHROUGH
+ #define DEFAULT_SYNC_FLAGBIT    0
+ #else
+ #define DEFAULT_SYNC_METHOD_STR "fsync"
+ #define DEFAULT_SYNC_METHOD        SYNC_METHOD_FSYNC
+ #define DEFAULT_SYNC_FLAGBIT    0
+ #endif
+
+
+ /*
+  * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
+  * but XLOG_BLCKSZ is assumed to be enough for it.
+  */
+ #ifdef O_DIRECT
+ #define ALIGNOF_XLOG_BUFFER        XLOG_BLCKSZ
+ #else
+ #define ALIGNOF_XLOG_BUFFER        ALIGNOF_BUFFER
+ #endif
+
+ /*
   * The rmgr data to be written by XLogInsert() is defined by a chain of
   * one or more XLogRecData structs.  (Multiple structs would be used when
   * parts of the source data aren't physically adjacent in memory, or when
Index: src/tools/fsync/test_fsync.c
===================================================================
RCS file: /cvsroot/pgsql/src/tools/fsync/test_fsync.c,v
retrieving revision 1.16
diff -c -c -r1.16 test_fsync.c
*** src/tools/fsync/test_fsync.c    23 Nov 2006 17:20:47 -0000    1.16
--- src/tools/fsync/test_fsync.c    24 Nov 2006 18:57:43 -0000
***************
*** 1,10 ****
  /*
   *    test_fsync.c
!  *        tests if fsync can be done from another process than the original write
   */

! #include "../../include/pg_config.h"
! #include "../../include/pg_config_os.h"

  #include <sys/types.h>
  #include <sys/stat.h>
--- 1,12 ----
  /*
   *    test_fsync.c
!  *        test various fsync() methods
   */

! #include "postgres.h"
!
! #include "access/xlog_internal.h"
! #include "access/xlog.h"

  #include <sys/types.h>
  #include <sys/stat.h>
***************
*** 14,42 ****
  #include <time.h>
  #include <sys/time.h>
  #include <unistd.h>

  #ifdef WIN32
  #define FSYNC_FILENAME    "./test_fsync.out"
  #else
  #define FSYNC_FILENAME    "/var/tmp/test_fsync.out"
  #endif

! /* O_SYNC and O_FSYNC are the same */
! #if defined(O_SYNC)
! #define OPEN_SYNC_FLAG        O_SYNC
! #elif defined(O_FSYNC)
! #define OPEN_SYNC_FLAG        O_FSYNC
! #elif defined(O_DSYNC)
! #define OPEN_DATASYNC_FLAG    O_DSYNC
! #endif
!
! #if defined(OPEN_SYNC_FLAG)
! #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
! #define OPEN_DATASYNC_FLAG    O_DSYNC
! #endif
! #endif

! #define WAL_FILE_SIZE    (16 * 1024 * 1024)

  void        die(char *str);
  void        print_elapse(struct timeval start_t, struct timeval elapse_t);
--- 16,34 ----
  #include <time.h>
  #include <sys/time.h>
  #include <unistd.h>
+ #include <string.h>

  #ifdef WIN32
  #define FSYNC_FILENAME    "./test_fsync.out"
  #else
+ /* /tmp might be a memory file system */
  #define FSYNC_FILENAME    "/var/tmp/test_fsync.out"
  #endif

! #define WRITE_SIZE    (16 * 1024)

! /* We allocate extra to  guarantee ALIGNOF_XLOG_BUFFER alignment */
! #define ALLOCATE_WAL_FILE_SIZE    (WRITE_SIZE + ALIGNOF_XLOG_BUFFER)

  void        die(char *str);
  void        print_elapse(struct timeval start_t, struct timeval elapse_t);
***************
*** 49,55 ****
      int            tmpfile,
                  i,
                  loops = 1000;
!     char       *strout = (char *) malloc(WAL_FILE_SIZE);
      char       *filename = FSYNC_FILENAME;

      if (argc > 2 && strcmp(argv[1], "-f") == 0)
--- 41,47 ----
      int            tmpfile,
                  i,
                  loops = 1000;
!     char       *full_buf = (char *) malloc(ALLOCATE_WAL_FILE_SIZE), *buf;
      char       *filename = FSYNC_FILENAME;

      if (argc > 2 && strcmp(argv[1], "-f") == 0)
***************
*** 62,74 ****
      if (argc > 1)
          loops = atoi(argv[1]);

!     for (i = 0; i < WAL_FILE_SIZE; i++)
!         strout[i] = 'a';

      if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
          die("Cannot open output file.");
!     write(tmpfile, strout, WAL_FILE_SIZE);
!     fsync(tmpfile);                /* fsync so later fsync's don't have to do it */
      close(tmpfile);

      printf("Simple write timing:\n");
--- 54,70 ----
      if (argc > 1)
          loops = atoi(argv[1]);

!     buf = (char *)TYPEALIGN(ALIGNOF_XLOG_BUFFER, full_buf);
!     for (i = 0; i < WRITE_SIZE; i++)
!         buf[i] = 'a';

      if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
          die("Cannot open output file.");
!     if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
!         die("write failed");
!     /* fsync so later fsync's don't have to do it */
!     if (fsync(tmpfile) != 0)
!         die("fsync failed");
      close(tmpfile);

      printf("Simple write timing:\n");
***************
*** 78,84 ****
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         write(tmpfile, strout, 8192);
          close(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
--- 74,81 ----
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
          close(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
***************
*** 95,102 ****
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         write(tmpfile, strout, 8192);
!         fsync(tmpfile);
          close(tmpfile);
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
--- 92,101 ----
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (fsync(tmpfile) != 0)
!             die("fsync failed");
          close(tmpfile);
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
***************
*** 114,125 ****
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         write(tmpfile, strout, 8192);
          close(tmpfile);
          /* reopen file */
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         fsync(tmpfile);
          close(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
--- 113,126 ----
      {
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
          close(tmpfile);
          /* reopen file */
          if ((tmpfile = open(filename, O_RDWR)) == -1)
              die("Cannot open output file.");
!         if (fsync(tmpfile) != 0)
!             die("fsync failed");
          close(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
***************
*** 135,141 ****
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         write(tmpfile, strout, 16384);
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\tone 16k o_sync write   ");
--- 136,143 ----
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
!             die("write failed");
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\tone 16k o_sync write   ");
***************
*** 148,155 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         write(tmpfile, strout, 8192);
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
--- 150,159 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
***************
*** 169,175 ****
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         write(tmpfile, strout, 8192);
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\topen o_dsync, write    ");
--- 173,180 ----
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\topen o_dsync, write    ");
***************
*** 181,187 ****
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         write(tmpfile, strout, 8192);
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\topen o_sync, write     ");
--- 186,193 ----
          die("Cannot open output file.");
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
      printf("\topen o_sync, write     ");
***************
*** 199,205 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
          fdatasync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
--- 205,212 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
          fdatasync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
***************
*** 217,224 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         fsync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
--- 224,233 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (fsync(tmpfile) != 0)
!             die("fsync failed");
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
***************
*** 235,242 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         write(tmpfile, strout, 8192);
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
--- 244,253 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
***************
*** 254,261 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         write(tmpfile, strout, 8192);
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
--- 265,274 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
***************
*** 271,278 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         write(tmpfile, strout, 8192);
          fdatasync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
--- 284,293 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
          fdatasync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
***************
*** 290,298 ****
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         write(tmpfile, strout, 8192);
!         write(tmpfile, strout, 8192);
!         fsync(tmpfile);
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
--- 305,316 ----
      gettimeofday(&start_t, NULL);
      for (i = 0; i < loops; i++)
      {
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
!             die("write failed");
!         if (fsync(tmpfile) != 0)
!             die("fsync failed");
      }
      gettimeofday(&elapse_t, NULL);
      close(tmpfile);
***************
*** 300,305 ****
--- 318,324 ----
      print_elapse(start_t, elapse_t);
      printf("\n");

+     free(full_buf);
      unlink(filename);

      return 0;

pgsql-patches by date:

Previous
From: Bruce Momjian
Date:
Subject: Re: Avg performance for int8/numeric
Next
From: Tom Lane
Date:
Subject: Re: [PERFORM] Direct I/O issues