Re: [PERFORM] Direct I/O issues - Mailing list pgsql-patches
From | Bruce Momjian |
---|---|
Subject | Re: [PERFORM] Direct I/O issues |
Date | |
Msg-id | 200611241858.kAOIwjh23094@momjian.us Whole thread Raw |
Responses |
Re: [PERFORM] Direct I/O issues
|
List | pgsql-patches |
Greg Smith wrote: > On Thu, 23 Nov 2006, Tom Lane wrote: > > > * It does not check for errors (if it had, you might have realized the > > other problem). > > All the test_fsync code needs to check for errors better; there have been > multiple occasions where I've run that with quesiontable input and it > didn't complain, it just happily ran and reported times that were almost > 0. > > Thanks for the note about alignment, I had seen something about that in > the xlog.c but wasn't sure if that was important in this case. > > It's very important to the project I'm working on that I get this cleared > up, and I think I'm in a good position to fix it myself now. I just > wanted to report the issue and get some initial feedback on what's wrong. > I'll try to rewrite that code with an eye toward the "Determine optimal > fdatasync/fsync, O_SYNC/O_DSYNC options" to-do item, which is what I'd > really like to have. I have developed a patch that moves the defines into a include file where they can be used by the backend and test_fsync.c. I have also set up things so there is proper alignment for O_DIRECT, and added error checking. Not sure if people want this for 8.2. I think we can modify test_fsync.c anytime but the movement of the defines into an include file is a backend code change. -- Bruce Momjian bruce@momjian.us EnterpriseDB http://www.enterprisedb.com + If your life is a hard drive, Christ can be your backup. + Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.257 diff -c -c -r1.257 xlog.c *** src/backend/access/transam/xlog.c 21 Nov 2006 20:59:52 -0000 1.257 --- src/backend/access/transam/xlog.c 24 Nov 2006 18:57:39 -0000 *************** *** 49,127 **** #include "utils/pg_locale.h" - /* - * Because O_DIRECT bypasses the kernel buffers, and because we never - * read those buffers except during crash recovery, it is a win to use - * it in all cases where we sync on each write(). We could allow O_DIRECT - * with fsync(), but because skipping the kernel buffer forces writes out - * quickly, it seems best just to use it for O_SYNC. It is hard to imagine - * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT. - * Also, O_DIRECT is never enough to force data to the drives, it merely - * tries to bypass the kernel cache, so we still need O_SYNC or fsync(). - */ - #ifdef O_DIRECT - #define PG_O_DIRECT O_DIRECT - #else - #define PG_O_DIRECT 0 - #endif - - /* - * This chunk of hackery attempts to determine which file sync methods - * are available on the current platform, and to choose an appropriate - * default method. We assume that fsync() is always available, and that - * configure determined whether fdatasync() is. - */ - #if defined(O_SYNC) - #define BARE_OPEN_SYNC_FLAG O_SYNC - #elif defined(O_FSYNC) - #define BARE_OPEN_SYNC_FLAG O_FSYNC - #endif - #ifdef BARE_OPEN_SYNC_FLAG - #define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT) - #endif - - #if defined(O_DSYNC) - #if defined(OPEN_SYNC_FLAG) - /* O_DSYNC is distinct? */ - #if O_DSYNC != BARE_OPEN_SYNC_FLAG - #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) - #endif - #else /* !defined(OPEN_SYNC_FLAG) */ - /* Win32 only has O_DSYNC */ - #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) - #endif - #endif - - #if defined(OPEN_DATASYNC_FLAG) - #define DEFAULT_SYNC_METHOD_STR "open_datasync" - #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN - #define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG - #elif defined(HAVE_FDATASYNC) - #define DEFAULT_SYNC_METHOD_STR "fdatasync" - #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC - #define DEFAULT_SYNC_FLAGBIT 0 - #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY) - #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough" - #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH - #define DEFAULT_SYNC_FLAGBIT 0 - #else - #define DEFAULT_SYNC_METHOD_STR "fsync" - #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC - #define DEFAULT_SYNC_FLAGBIT 0 - #endif - - - /* - * Limitation of buffer-alignment for direct IO depends on OS and filesystem, - * but XLOG_BLCKSZ is assumed to be enough for it. - */ - #ifdef O_DIRECT - #define ALIGNOF_XLOG_BUFFER XLOG_BLCKSZ - #else - #define ALIGNOF_XLOG_BUFFER ALIGNOF_BUFFER - #endif - - /* File path names (all relative to $PGDATA) */ #define BACKUP_LABEL_FILE "backup_label" #define BACKUP_LABEL_OLD "backup_label.old" --- 49,54 ---- Index: src/include/access/xlog.h =================================================================== RCS file: /cvsroot/pgsql/src/include/access/xlog.h,v retrieving revision 1.75 diff -c -c -r1.75 xlog.h *** src/include/access/xlog.h 5 Nov 2006 22:42:10 -0000 1.75 --- src/include/access/xlog.h 24 Nov 2006 18:57:42 -0000 *************** *** 90,95 **** --- 90,167 ---- extern int sync_method; /* + * Because O_DIRECT bypasses the kernel buffers, and because we never + * read those buffers except during crash recovery, it is a win to use + * it in all cases where we sync on each write(). We could allow O_DIRECT + * with fsync(), but because skipping the kernel buffer forces writes out + * quickly, it seems best just to use it for O_SYNC. It is hard to imagine + * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT. + * Also, O_DIRECT is never enough to force data to the drives, it merely + * tries to bypass the kernel cache, so we still need O_SYNC or fsync(). + */ + #ifdef O_DIRECT + #define PG_O_DIRECT O_DIRECT + #else + #define PG_O_DIRECT 0 + #endif + + /* + * This chunk of hackery attempts to determine which file sync methods + * are available on the current platform, and to choose an appropriate + * default method. We assume that fsync() is always available, and that + * configure determined whether fdatasync() is. + */ + #if defined(O_SYNC) + #define BARE_OPEN_SYNC_FLAG O_SYNC + #elif defined(O_FSYNC) + #define BARE_OPEN_SYNC_FLAG O_FSYNC + #endif + #ifdef BARE_OPEN_SYNC_FLAG + #define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT) + #endif + + #if defined(O_DSYNC) + #if defined(OPEN_SYNC_FLAG) + /* O_DSYNC is distinct? */ + #if O_DSYNC != BARE_OPEN_SYNC_FLAG + #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) + #endif + #else /* !defined(OPEN_SYNC_FLAG) */ + /* Win32 only has O_DSYNC */ + #define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) + #endif + #endif + + #if defined(OPEN_DATASYNC_FLAG) + #define DEFAULT_SYNC_METHOD_STR "open_datasync" + #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN + #define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG + #elif defined(HAVE_FDATASYNC) + #define DEFAULT_SYNC_METHOD_STR "fdatasync" + #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC + #define DEFAULT_SYNC_FLAGBIT 0 + #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY) + #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough" + #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH + #define DEFAULT_SYNC_FLAGBIT 0 + #else + #define DEFAULT_SYNC_METHOD_STR "fsync" + #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC + #define DEFAULT_SYNC_FLAGBIT 0 + #endif + + + /* + * Limitation of buffer-alignment for direct IO depends on OS and filesystem, + * but XLOG_BLCKSZ is assumed to be enough for it. + */ + #ifdef O_DIRECT + #define ALIGNOF_XLOG_BUFFER XLOG_BLCKSZ + #else + #define ALIGNOF_XLOG_BUFFER ALIGNOF_BUFFER + #endif + + /* * The rmgr data to be written by XLogInsert() is defined by a chain of * one or more XLogRecData structs. (Multiple structs would be used when * parts of the source data aren't physically adjacent in memory, or when Index: src/tools/fsync/test_fsync.c =================================================================== RCS file: /cvsroot/pgsql/src/tools/fsync/test_fsync.c,v retrieving revision 1.16 diff -c -c -r1.16 test_fsync.c *** src/tools/fsync/test_fsync.c 23 Nov 2006 17:20:47 -0000 1.16 --- src/tools/fsync/test_fsync.c 24 Nov 2006 18:57:43 -0000 *************** *** 1,10 **** /* * test_fsync.c ! * tests if fsync can be done from another process than the original write */ ! #include "../../include/pg_config.h" ! #include "../../include/pg_config_os.h" #include <sys/types.h> #include <sys/stat.h> --- 1,12 ---- /* * test_fsync.c ! * test various fsync() methods */ ! #include "postgres.h" ! ! #include "access/xlog_internal.h" ! #include "access/xlog.h" #include <sys/types.h> #include <sys/stat.h> *************** *** 14,42 **** #include <time.h> #include <sys/time.h> #include <unistd.h> #ifdef WIN32 #define FSYNC_FILENAME "./test_fsync.out" #else #define FSYNC_FILENAME "/var/tmp/test_fsync.out" #endif ! /* O_SYNC and O_FSYNC are the same */ ! #if defined(O_SYNC) ! #define OPEN_SYNC_FLAG O_SYNC ! #elif defined(O_FSYNC) ! #define OPEN_SYNC_FLAG O_FSYNC ! #elif defined(O_DSYNC) ! #define OPEN_DATASYNC_FLAG O_DSYNC ! #endif ! ! #if defined(OPEN_SYNC_FLAG) ! #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG) ! #define OPEN_DATASYNC_FLAG O_DSYNC ! #endif ! #endif ! #define WAL_FILE_SIZE (16 * 1024 * 1024) void die(char *str); void print_elapse(struct timeval start_t, struct timeval elapse_t); --- 16,34 ---- #include <time.h> #include <sys/time.h> #include <unistd.h> + #include <string.h> #ifdef WIN32 #define FSYNC_FILENAME "./test_fsync.out" #else + /* /tmp might be a memory file system */ #define FSYNC_FILENAME "/var/tmp/test_fsync.out" #endif ! #define WRITE_SIZE (16 * 1024) ! /* We allocate extra to guarantee ALIGNOF_XLOG_BUFFER alignment */ ! #define ALLOCATE_WAL_FILE_SIZE (WRITE_SIZE + ALIGNOF_XLOG_BUFFER) void die(char *str); void print_elapse(struct timeval start_t, struct timeval elapse_t); *************** *** 49,55 **** int tmpfile, i, loops = 1000; ! char *strout = (char *) malloc(WAL_FILE_SIZE); char *filename = FSYNC_FILENAME; if (argc > 2 && strcmp(argv[1], "-f") == 0) --- 41,47 ---- int tmpfile, i, loops = 1000; ! char *full_buf = (char *) malloc(ALLOCATE_WAL_FILE_SIZE), *buf; char *filename = FSYNC_FILENAME; if (argc > 2 && strcmp(argv[1], "-f") == 0) *************** *** 62,74 **** if (argc > 1) loops = atoi(argv[1]); ! for (i = 0; i < WAL_FILE_SIZE; i++) ! strout[i] = 'a'; if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1) die("Cannot open output file."); ! write(tmpfile, strout, WAL_FILE_SIZE); ! fsync(tmpfile); /* fsync so later fsync's don't have to do it */ close(tmpfile); printf("Simple write timing:\n"); --- 54,70 ---- if (argc > 1) loops = atoi(argv[1]); ! buf = (char *)TYPEALIGN(ALIGNOF_XLOG_BUFFER, full_buf); ! for (i = 0; i < WRITE_SIZE; i++) ! buf[i] = 'a'; if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1) die("Cannot open output file."); ! if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) ! die("write failed"); ! /* fsync so later fsync's don't have to do it */ ! if (fsync(tmpfile) != 0) ! die("fsync failed"); close(tmpfile); printf("Simple write timing:\n"); *************** *** 78,84 **** { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! write(tmpfile, strout, 8192); close(tmpfile); } gettimeofday(&elapse_t, NULL); --- 74,81 ---- { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); close(tmpfile); } gettimeofday(&elapse_t, NULL); *************** *** 95,102 **** { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! write(tmpfile, strout, 8192); ! fsync(tmpfile); close(tmpfile); if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); --- 92,101 ---- { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (fsync(tmpfile) != 0) ! die("fsync failed"); close(tmpfile); if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); *************** *** 114,125 **** { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! write(tmpfile, strout, 8192); close(tmpfile); /* reopen file */ if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! fsync(tmpfile); close(tmpfile); } gettimeofday(&elapse_t, NULL); --- 113,126 ---- { if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); close(tmpfile); /* reopen file */ if ((tmpfile = open(filename, O_RDWR)) == -1) die("Cannot open output file."); ! if (fsync(tmpfile) != 0) ! die("fsync failed"); close(tmpfile); } gettimeofday(&elapse_t, NULL); *************** *** 135,141 **** die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! write(tmpfile, strout, 16384); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\tone 16k o_sync write "); --- 136,143 ---- die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) ! die("write failed"); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\tone 16k o_sync write "); *************** *** 148,155 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! write(tmpfile, strout, 8192); } gettimeofday(&elapse_t, NULL); close(tmpfile); --- 150,159 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); } gettimeofday(&elapse_t, NULL); close(tmpfile); *************** *** 169,175 **** die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! write(tmpfile, strout, 8192); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\topen o_dsync, write "); --- 173,180 ---- die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\topen o_dsync, write "); *************** *** 181,187 **** die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! write(tmpfile, strout, 8192); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\topen o_sync, write "); --- 186,193 ---- die("Cannot open output file."); gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); gettimeofday(&elapse_t, NULL); close(tmpfile); printf("\topen o_sync, write "); *************** *** 199,205 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); fdatasync(tmpfile); } gettimeofday(&elapse_t, NULL); --- 205,212 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); fdatasync(tmpfile); } gettimeofday(&elapse_t, NULL); *************** *** 217,224 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! fsync(tmpfile); } gettimeofday(&elapse_t, NULL); close(tmpfile); --- 224,233 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (fsync(tmpfile) != 0) ! die("fsync failed"); } gettimeofday(&elapse_t, NULL); close(tmpfile); *************** *** 235,242 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! write(tmpfile, strout, 8192); } gettimeofday(&elapse_t, NULL); close(tmpfile); --- 244,253 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); } gettimeofday(&elapse_t, NULL); close(tmpfile); *************** *** 254,261 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! write(tmpfile, strout, 8192); } gettimeofday(&elapse_t, NULL); close(tmpfile); --- 265,274 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); } gettimeofday(&elapse_t, NULL); close(tmpfile); *************** *** 271,278 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! write(tmpfile, strout, 8192); fdatasync(tmpfile); } gettimeofday(&elapse_t, NULL); --- 284,293 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); fdatasync(tmpfile); } gettimeofday(&elapse_t, NULL); *************** *** 290,298 **** gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! write(tmpfile, strout, 8192); ! write(tmpfile, strout, 8192); ! fsync(tmpfile); } gettimeofday(&elapse_t, NULL); close(tmpfile); --- 305,316 ---- gettimeofday(&start_t, NULL); for (i = 0; i < loops; i++) { ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2) ! die("write failed"); ! if (fsync(tmpfile) != 0) ! die("fsync failed"); } gettimeofday(&elapse_t, NULL); close(tmpfile); *************** *** 300,305 **** --- 318,324 ---- print_elapse(start_t, elapse_t); printf("\n"); + free(full_buf); unlink(filename); return 0;
pgsql-patches by date: