Greg Smith wrote:
> On Thu, 23 Nov 2006, Tom Lane wrote:
>
> > * It does not check for errors (if it had, you might have realized the
> > other problem).
>
> All the test_fsync code needs to check for errors better; there have been
> multiple occasions where I've run that with quesiontable input and it
> didn't complain, it just happily ran and reported times that were almost
> 0.
>
> Thanks for the note about alignment, I had seen something about that in
> the xlog.c but wasn't sure if that was important in this case.
>
> It's very important to the project I'm working on that I get this cleared
> up, and I think I'm in a good position to fix it myself now. I just
> wanted to report the issue and get some initial feedback on what's wrong.
> I'll try to rewrite that code with an eye toward the "Determine optimal
> fdatasync/fsync, O_SYNC/O_DSYNC options" to-do item, which is what I'd
> really like to have.
I have developed a patch that moves the defines into a include file
where they can be used by the backend and test_fsync.c. I have also set
up things so there is proper alignment for O_DIRECT, and added error
checking.
Not sure if people want this for 8.2. I think we can modify
test_fsync.c anytime but the movement of the defines into an include
file is a backend code change.
--
Bruce Momjian bruce@momjian.us
EnterpriseDB http://www.enterprisedb.com
+ If your life is a hard drive, Christ can be your backup. +
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.257
diff -c -c -r1.257 xlog.c
*** src/backend/access/transam/xlog.c 21 Nov 2006 20:59:52 -0000 1.257
--- src/backend/access/transam/xlog.c 24 Nov 2006 18:57:39 -0000
***************
*** 49,127 ****
#include "utils/pg_locale.h"
- /*
- * Because O_DIRECT bypasses the kernel buffers, and because we never
- * read those buffers except during crash recovery, it is a win to use
- * it in all cases where we sync on each write(). We could allow O_DIRECT
- * with fsync(), but because skipping the kernel buffer forces writes out
- * quickly, it seems best just to use it for O_SYNC. It is hard to imagine
- * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
- * Also, O_DIRECT is never enough to force data to the drives, it merely
- * tries to bypass the kernel cache, so we still need O_SYNC or fsync().
- */
- #ifdef O_DIRECT
- #define PG_O_DIRECT O_DIRECT
- #else
- #define PG_O_DIRECT 0
- #endif
-
- /*
- * This chunk of hackery attempts to determine which file sync methods
- * are available on the current platform, and to choose an appropriate
- * default method. We assume that fsync() is always available, and that
- * configure determined whether fdatasync() is.
- */
- #if defined(O_SYNC)
- #define BARE_OPEN_SYNC_FLAG O_SYNC
- #elif defined(O_FSYNC)
- #define BARE_OPEN_SYNC_FLAG O_FSYNC
- #endif
- #ifdef BARE_OPEN_SYNC_FLAG
- #define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
- #endif
-
- #if defined(O_DSYNC)
- #if defined(OPEN_SYNC_FLAG)
- /* O_DSYNC is distinct? */
- #if O_DSYNC != BARE_OPEN_SYNC_FLAG
- #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
- #endif
- #else /* !defined(OPEN_SYNC_FLAG) */
- /* Win32 only has O_DSYNC */
- #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
- #endif
- #endif
-
- #if defined(OPEN_DATASYNC_FLAG)
- #define DEFAULT_SYNC_METHOD_STR "open_datasync"
- #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
- #define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG
- #elif defined(HAVE_FDATASYNC)
- #define DEFAULT_SYNC_METHOD_STR "fdatasync"
- #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
- #define DEFAULT_SYNC_FLAGBIT 0
- #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
- #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
- #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH
- #define DEFAULT_SYNC_FLAGBIT 0
- #else
- #define DEFAULT_SYNC_METHOD_STR "fsync"
- #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC
- #define DEFAULT_SYNC_FLAGBIT 0
- #endif
-
-
- /*
- * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
- * but XLOG_BLCKSZ is assumed to be enough for it.
- */
- #ifdef O_DIRECT
- #define ALIGNOF_XLOG_BUFFER XLOG_BLCKSZ
- #else
- #define ALIGNOF_XLOG_BUFFER ALIGNOF_BUFFER
- #endif
-
-
/* File path names (all relative to $PGDATA) */
#define BACKUP_LABEL_FILE "backup_label"
#define BACKUP_LABEL_OLD "backup_label.old"
--- 49,54 ----
Index: src/include/access/xlog.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.75
diff -c -c -r1.75 xlog.h
*** src/include/access/xlog.h 5 Nov 2006 22:42:10 -0000 1.75
--- src/include/access/xlog.h 24 Nov 2006 18:57:42 -0000
***************
*** 90,95 ****
--- 90,167 ----
extern int sync_method;
/*
+ * Because O_DIRECT bypasses the kernel buffers, and because we never
+ * read those buffers except during crash recovery, it is a win to use
+ * it in all cases where we sync on each write(). We could allow O_DIRECT
+ * with fsync(), but because skipping the kernel buffer forces writes out
+ * quickly, it seems best just to use it for O_SYNC. It is hard to imagine
+ * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
+ * Also, O_DIRECT is never enough to force data to the drives, it merely
+ * tries to bypass the kernel cache, so we still need O_SYNC or fsync().
+ */
+ #ifdef O_DIRECT
+ #define PG_O_DIRECT O_DIRECT
+ #else
+ #define PG_O_DIRECT 0
+ #endif
+
+ /*
+ * This chunk of hackery attempts to determine which file sync methods
+ * are available on the current platform, and to choose an appropriate
+ * default method. We assume that fsync() is always available, and that
+ * configure determined whether fdatasync() is.
+ */
+ #if defined(O_SYNC)
+ #define BARE_OPEN_SYNC_FLAG O_SYNC
+ #elif defined(O_FSYNC)
+ #define BARE_OPEN_SYNC_FLAG O_FSYNC
+ #endif
+ #ifdef BARE_OPEN_SYNC_FLAG
+ #define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+ #endif
+
+ #if defined(O_DSYNC)
+ #if defined(OPEN_SYNC_FLAG)
+ /* O_DSYNC is distinct? */
+ #if O_DSYNC != BARE_OPEN_SYNC_FLAG
+ #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
+ #endif
+ #else /* !defined(OPEN_SYNC_FLAG) */
+ /* Win32 only has O_DSYNC */
+ #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
+ #endif
+ #endif
+
+ #if defined(OPEN_DATASYNC_FLAG)
+ #define DEFAULT_SYNC_METHOD_STR "open_datasync"
+ #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
+ #define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG
+ #elif defined(HAVE_FDATASYNC)
+ #define DEFAULT_SYNC_METHOD_STR "fdatasync"
+ #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC
+ #define DEFAULT_SYNC_FLAGBIT 0
+ #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
+ #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
+ #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH
+ #define DEFAULT_SYNC_FLAGBIT 0
+ #else
+ #define DEFAULT_SYNC_METHOD_STR "fsync"
+ #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC
+ #define DEFAULT_SYNC_FLAGBIT 0
+ #endif
+
+
+ /*
+ * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
+ * but XLOG_BLCKSZ is assumed to be enough for it.
+ */
+ #ifdef O_DIRECT
+ #define ALIGNOF_XLOG_BUFFER XLOG_BLCKSZ
+ #else
+ #define ALIGNOF_XLOG_BUFFER ALIGNOF_BUFFER
+ #endif
+
+ /*
* The rmgr data to be written by XLogInsert() is defined by a chain of
* one or more XLogRecData structs. (Multiple structs would be used when
* parts of the source data aren't physically adjacent in memory, or when
Index: src/tools/fsync/test_fsync.c
===================================================================
RCS file: /cvsroot/pgsql/src/tools/fsync/test_fsync.c,v
retrieving revision 1.16
diff -c -c -r1.16 test_fsync.c
*** src/tools/fsync/test_fsync.c 23 Nov 2006 17:20:47 -0000 1.16
--- src/tools/fsync/test_fsync.c 24 Nov 2006 18:57:43 -0000
***************
*** 1,10 ****
/*
* test_fsync.c
! * tests if fsync can be done from another process than the original write
*/
! #include "../../include/pg_config.h"
! #include "../../include/pg_config_os.h"
#include <sys/types.h>
#include <sys/stat.h>
--- 1,12 ----
/*
* test_fsync.c
! * test various fsync() methods
*/
! #include "postgres.h"
!
! #include "access/xlog_internal.h"
! #include "access/xlog.h"
#include <sys/types.h>
#include <sys/stat.h>
***************
*** 14,42 ****
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#ifdef WIN32
#define FSYNC_FILENAME "./test_fsync.out"
#else
#define FSYNC_FILENAME "/var/tmp/test_fsync.out"
#endif
! /* O_SYNC and O_FSYNC are the same */
! #if defined(O_SYNC)
! #define OPEN_SYNC_FLAG O_SYNC
! #elif defined(O_FSYNC)
! #define OPEN_SYNC_FLAG O_FSYNC
! #elif defined(O_DSYNC)
! #define OPEN_DATASYNC_FLAG O_DSYNC
! #endif
!
! #if defined(OPEN_SYNC_FLAG)
! #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
! #define OPEN_DATASYNC_FLAG O_DSYNC
! #endif
! #endif
! #define WAL_FILE_SIZE (16 * 1024 * 1024)
void die(char *str);
void print_elapse(struct timeval start_t, struct timeval elapse_t);
--- 16,34 ----
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
+ #include <string.h>
#ifdef WIN32
#define FSYNC_FILENAME "./test_fsync.out"
#else
+ /* /tmp might be a memory file system */
#define FSYNC_FILENAME "/var/tmp/test_fsync.out"
#endif
! #define WRITE_SIZE (16 * 1024)
! /* We allocate extra to guarantee ALIGNOF_XLOG_BUFFER alignment */
! #define ALLOCATE_WAL_FILE_SIZE (WRITE_SIZE + ALIGNOF_XLOG_BUFFER)
void die(char *str);
void print_elapse(struct timeval start_t, struct timeval elapse_t);
***************
*** 49,55 ****
int tmpfile,
i,
loops = 1000;
! char *strout = (char *) malloc(WAL_FILE_SIZE);
char *filename = FSYNC_FILENAME;
if (argc > 2 && strcmp(argv[1], "-f") == 0)
--- 41,47 ----
int tmpfile,
i,
loops = 1000;
! char *full_buf = (char *) malloc(ALLOCATE_WAL_FILE_SIZE), *buf;
char *filename = FSYNC_FILENAME;
if (argc > 2 && strcmp(argv[1], "-f") == 0)
***************
*** 62,74 ****
if (argc > 1)
loops = atoi(argv[1]);
! for (i = 0; i < WAL_FILE_SIZE; i++)
! strout[i] = 'a';
if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
die("Cannot open output file.");
! write(tmpfile, strout, WAL_FILE_SIZE);
! fsync(tmpfile); /* fsync so later fsync's don't have to do it */
close(tmpfile);
printf("Simple write timing:\n");
--- 54,70 ----
if (argc > 1)
loops = atoi(argv[1]);
! buf = (char *)TYPEALIGN(ALIGNOF_XLOG_BUFFER, full_buf);
! for (i = 0; i < WRITE_SIZE; i++)
! buf[i] = 'a';
if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
die("Cannot open output file.");
! if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
! die("write failed");
! /* fsync so later fsync's don't have to do it */
! if (fsync(tmpfile) != 0)
! die("fsync failed");
close(tmpfile);
printf("Simple write timing:\n");
***************
*** 78,84 ****
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! write(tmpfile, strout, 8192);
close(tmpfile);
}
gettimeofday(&elapse_t, NULL);
--- 74,81 ----
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
close(tmpfile);
}
gettimeofday(&elapse_t, NULL);
***************
*** 95,102 ****
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! write(tmpfile, strout, 8192);
! fsync(tmpfile);
close(tmpfile);
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
--- 92,101 ----
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (fsync(tmpfile) != 0)
! die("fsync failed");
close(tmpfile);
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
***************
*** 114,125 ****
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! write(tmpfile, strout, 8192);
close(tmpfile);
/* reopen file */
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! fsync(tmpfile);
close(tmpfile);
}
gettimeofday(&elapse_t, NULL);
--- 113,126 ----
{
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
close(tmpfile);
/* reopen file */
if ((tmpfile = open(filename, O_RDWR)) == -1)
die("Cannot open output file.");
! if (fsync(tmpfile) != 0)
! die("fsync failed");
close(tmpfile);
}
gettimeofday(&elapse_t, NULL);
***************
*** 135,141 ****
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! write(tmpfile, strout, 16384);
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\tone 16k o_sync write ");
--- 136,143 ----
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
! die("write failed");
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\tone 16k o_sync write ");
***************
*** 148,155 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! write(tmpfile, strout, 8192);
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
--- 150,159 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
***************
*** 169,175 ****
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! write(tmpfile, strout, 8192);
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\topen o_dsync, write ");
--- 173,180 ----
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\topen o_dsync, write ");
***************
*** 181,187 ****
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! write(tmpfile, strout, 8192);
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\topen o_sync, write ");
--- 186,193 ----
die("Cannot open output file.");
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
gettimeofday(&elapse_t, NULL);
close(tmpfile);
printf("\topen o_sync, write ");
***************
*** 199,205 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
fdatasync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
--- 205,212 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
fdatasync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
***************
*** 217,224 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! fsync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
--- 224,233 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (fsync(tmpfile) != 0)
! die("fsync failed");
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
***************
*** 235,242 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! write(tmpfile, strout, 8192);
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
--- 244,253 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
***************
*** 254,261 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! write(tmpfile, strout, 8192);
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
--- 265,274 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
***************
*** 271,278 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! write(tmpfile, strout, 8192);
fdatasync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
--- 284,293 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
fdatasync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
***************
*** 290,298 ****
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! write(tmpfile, strout, 8192);
! write(tmpfile, strout, 8192);
! fsync(tmpfile);
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
--- 305,316 ----
gettimeofday(&start_t, NULL);
for (i = 0; i < loops; i++)
{
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! die("write failed");
! if (fsync(tmpfile) != 0)
! die("fsync failed");
}
gettimeofday(&elapse_t, NULL);
close(tmpfile);
***************
*** 300,305 ****
--- 318,324 ----
print_elapse(start_t, elapse_t);
printf("\n");
+ free(full_buf);
unlink(filename);
return 0;