Re: Streaming replication on win32, still broken - Mailing list pgsql-hackers
From | Heikki Linnakangas |
---|---|
Subject | Re: Streaming replication on win32, still broken |
Date | |
Msg-id | 4B7D36B6.7030903@enterprisedb.com Whole thread Raw |
In response to | Re: Streaming replication on win32, still broken (Magnus Hagander <magnus@hagander.net>) |
Responses |
Re: Streaming replication on win32, still broken
|
List | pgsql-hackers |
Magnus Hagander wrote: > 2010/2/18 Fujii Masao <masao.fujii@gmail.com>: >> On Thu, Feb 18, 2010 at 7:04 PM, Heikki Linnakangas >> <heikki.linnakangas@enterprisedb.com> wrote: >>> Magnus Hagander wrote: >>>> O_DIRECT helps us when we're not going to read the file again, because >>>> we don't waste cache on it. If we are, which is the case here, it >>>> should be really bad for performance, since we actually have to do a >>>> physical read. >>>> >>>> Incidentally, that should also apply to general WAL when archive_mdoe >>>> is on. Do we optimize for that? >>> Hmm, no we don't. We do take that into account so that we refrain from >>> issuing posix_fadvice(DONTNEED) if archive_mode is on, but we don't >>> disable O_DIRECT. Maybe we should.. >> Since the performance of WAL write is more important than that of WAL >> archiving in general, that optimization might offer little benefit. > > Well, it's going to make the process that reads the WAL cause actual > physical I/O... That'll take a chunk out of your total available I/O, > which is likely to push you to the limit of your I/O capacity much > quicker. Right, doesn't seem sensible, though it would be nice to see a benchmark on that. Here's a patch to disable O_DIRECT when archiving or streaming is enabled. This is pretty hard to test, so any extra eyeballs would be nice.. -- Heikki Linnakangas EnterpriseDB http://www.enterprisedb.com ? GNUmakefile ? a.patch ? b ? backend ? config.log ? config.status ? config.status.lineno ? configure.lineno ? gin-splay-1.patch ? gin-splay-2.patch ? gin-splay-3.patch ? include ? libpq ? log_unlogged_op_0115.patch ? md-1.c ? md-1.patch ? replication ? temp-file-resowner-2.patch ? contrib/pgbench/.deps ? contrib/pgbench/fsynctest ? contrib/pgbench/fsynctest.c ? contrib/pgbench/fsynctestfile ? contrib/pgbench/pgbench ? contrib/spi/.deps ? doc/src/sgml/HTML.index ? doc/src/sgml/bookindex.sgml ? doc/src/sgml/features-supported.sgml ? doc/src/sgml/features-unsupported.sgml ? doc/src/sgml/version.sgml ? src/Makefile.global ? src/backend/aaa.patch ? src/backend/postgres ? src/backend/access/common/.deps ? src/backend/access/gin/.deps ? src/backend/access/gist/.deps ? src/backend/access/hash/.deps ? src/backend/access/heap/.deps ? src/backend/access/index/.deps ? src/backend/access/nbtree/.deps ? src/backend/access/transam/.deps ? src/backend/bootstrap/.deps ? src/backend/catalog/.deps ? src/backend/commands/.deps ? src/backend/executor/.deps ? src/backend/foreign/.deps ? src/backend/foreign/dummy/.deps ? src/backend/foreign/postgresql/.deps ? src/backend/lib/.deps ? src/backend/libpq/.deps ? src/backend/main/.deps ? src/backend/nodes/.deps ? src/backend/optimizer/geqo/.deps ? src/backend/optimizer/path/.deps ? src/backend/optimizer/plan/.deps ? src/backend/optimizer/prep/.deps ? src/backend/optimizer/util/.deps ? src/backend/parser/.deps ? src/backend/po/af.mo ? src/backend/po/cs.mo ? src/backend/po/hr.mo ? src/backend/po/hu.mo ? src/backend/po/it.mo ? src/backend/po/ko.mo ? src/backend/po/nb.mo ? src/backend/po/nl.mo ? src/backend/po/pl.mo ? src/backend/po/ro.mo ? src/backend/po/ru.mo ? src/backend/po/sk.mo ? src/backend/po/sl.mo ? src/backend/po/sv.mo ? src/backend/po/zh_CN.mo ? src/backend/po/zh_TW.mo ? src/backend/port/.deps ? src/backend/postmaster/.deps ? src/backend/regex/.deps ? src/backend/replication/.deps ? src/backend/replication/libpqwalreceiver/.deps ? src/backend/rewrite/.deps ? src/backend/snowball/.deps ? src/backend/snowball/snowball_create.sql ? src/backend/storage/buffer/.deps ? src/backend/storage/file/.deps ? src/backend/storage/freespace/.deps ? src/backend/storage/ipc/.deps ? src/backend/storage/large_object/.deps ? src/backend/storage/lmgr/.deps ? src/backend/storage/page/.deps ? src/backend/storage/smgr/.deps ? src/backend/tcop/.deps ? src/backend/tsearch/.deps ? src/backend/utils/.deps ? src/backend/utils/probes.h ? src/backend/utils/adt/.deps ? src/backend/utils/cache/.deps ? src/backend/utils/error/.deps ? src/backend/utils/fmgr/.deps ? src/backend/utils/hash/.deps ? src/backend/utils/init/.deps ? src/backend/utils/mb/.deps ? src/backend/utils/mb/Unicode/BIG5.TXT ? src/backend/utils/mb/Unicode/CP950.TXT ? src/backend/utils/mb/conversion_procs/conversion_create.sql ? src/backend/utils/mb/conversion_procs/ascii_and_mic/.deps ? src/backend/utils/mb/conversion_procs/cyrillic_and_mic/.deps ? src/backend/utils/mb/conversion_procs/euc2004_sjis2004/.deps ? src/backend/utils/mb/conversion_procs/euc_cn_and_mic/.deps ? src/backend/utils/mb/conversion_procs/euc_jis_2004_and_shift_jis_2004/.deps ? src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/.deps ? src/backend/utils/mb/conversion_procs/euc_kr_and_mic/.deps ? src/backend/utils/mb/conversion_procs/euc_tw_and_big5/.deps ? src/backend/utils/mb/conversion_procs/latin2_and_win1250/.deps ? src/backend/utils/mb/conversion_procs/latin_and_mic/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_ascii/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_big5/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc2004/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc_jis_2004/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_gb18030/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_gbk/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_iso8859/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_johab/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_shift_jis_2004/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_sjis/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_uhc/.deps ? src/backend/utils/mb/conversion_procs/utf8_and_win/.deps ? src/backend/utils/misc/.deps ? src/backend/utils/mmgr/.deps ? src/backend/utils/resowner/.deps ? src/backend/utils/sort/.deps ? src/backend/utils/time/.deps ? src/bin/initdb/.deps ? src/bin/initdb/initdb ? src/bin/initdb/po/ko.mo ? src/bin/initdb/po/pl.mo ? src/bin/initdb/po/ro.mo ? src/bin/initdb/po/sk.mo ? src/bin/initdb/po/sl.mo ? src/bin/initdb/po/zh_CN.mo ? src/bin/initdb/po/zh_TW.mo ? src/bin/pg_config/.deps ? src/bin/pg_config/pg_config ? src/bin/pg_config/po/cs.mo ? src/bin/pg_config/po/pl.mo ? src/bin/pg_config/po/sl.mo ? src/bin/pg_config/po/zh_CN.mo ? src/bin/pg_config/po/zh_TW.mo ? src/bin/pg_controldata/.deps ? src/bin/pg_controldata/pg_controldata ? src/bin/pg_controldata/po/cs.mo ? src/bin/pg_controldata/po/fa.mo ? src/bin/pg_controldata/po/hu.mo ? src/bin/pg_controldata/po/nb.mo ? src/bin/pg_controldata/po/pl.mo ? src/bin/pg_controldata/po/ro.mo ? src/bin/pg_controldata/po/ru.mo ? src/bin/pg_controldata/po/sk.mo ? src/bin/pg_controldata/po/sl.mo ? src/bin/pg_controldata/po/zh_CN.mo ? src/bin/pg_controldata/po/zh_TW.mo ? src/bin/pg_ctl/.deps ? src/bin/pg_ctl/pg_ctl ? src/bin/pg_ctl/po/cs.mo ? src/bin/pg_ctl/po/ro.mo ? src/bin/pg_ctl/po/sk.mo ? src/bin/pg_ctl/po/sl.mo ? src/bin/pg_ctl/po/zh_CN.mo ? src/bin/pg_ctl/po/zh_TW.mo ? src/bin/pg_dump/.deps ? src/bin/pg_dump/pg_dump ? src/bin/pg_dump/pg_dumpall ? src/bin/pg_dump/pg_restore ? src/bin/pg_dump/po/cs.mo ? src/bin/pg_dump/po/ko.mo ? src/bin/pg_dump/po/nb.mo ? src/bin/pg_dump/po/ro.mo ? src/bin/pg_dump/po/ru.mo ? src/bin/pg_dump/po/sk.mo ? src/bin/pg_dump/po/sl.mo ? src/bin/pg_dump/po/zh_CN.mo ? src/bin/pg_dump/po/zh_TW.mo ? src/bin/pg_resetxlog/.deps ? src/bin/pg_resetxlog/pg_resetxlog ? src/bin/pg_resetxlog/po/cs.mo ? src/bin/pg_resetxlog/po/hu.mo ? src/bin/pg_resetxlog/po/nb.mo ? src/bin/pg_resetxlog/po/sk.mo ? src/bin/pg_resetxlog/po/sl.mo ? src/bin/pg_resetxlog/po/zh_CN.mo ? src/bin/pg_resetxlog/po/zh_TW.mo ? src/bin/psql/.deps ? src/bin/psql/psql ? src/bin/psql/po/fa.mo ? src/bin/psql/po/hu.mo ? src/bin/psql/po/it.mo ? src/bin/psql/po/ko.mo ? src/bin/psql/po/nb.mo ? src/bin/psql/po/ro.mo ? src/bin/psql/po/ru.mo ? src/bin/psql/po/sk.mo ? src/bin/psql/po/sl.mo ? src/bin/psql/po/zh_CN.mo ? src/bin/psql/po/zh_TW.mo ? src/bin/scripts/.deps ? src/bin/scripts/clusterdb ? src/bin/scripts/createdb ? src/bin/scripts/createlang ? src/bin/scripts/createuser ? src/bin/scripts/dropdb ? src/bin/scripts/droplang ? src/bin/scripts/dropuser ? src/bin/scripts/foo ? src/bin/scripts/reindexdb ? src/bin/scripts/vacuumdb ? src/bin/scripts/po/ru.mo ? src/bin/scripts/po/sk.mo ? src/bin/scripts/po/sl.mo ? src/bin/scripts/po/zh_CN.mo ? src/bin/scripts/po/zh_TW.mo ? src/include/pg_config.h ? src/include/stamp-h ? src/interfaces/ecpg/compatlib/.deps ? src/interfaces/ecpg/compatlib/exports.list ? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.1 ? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.2 ? src/interfaces/ecpg/ecpglib/.deps ? src/interfaces/ecpg/ecpglib/exports.list ? src/interfaces/ecpg/ecpglib/libecpg.so.6.1 ? src/interfaces/ecpg/ecpglib/libecpg.so.6.2 ? src/interfaces/ecpg/include/ecpg_config.h ? src/interfaces/ecpg/include/stamp-h ? src/interfaces/ecpg/pgtypeslib/.deps ? src/interfaces/ecpg/pgtypeslib/exports.list ? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.1 ? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.2 ? src/interfaces/ecpg/preproc/.deps ? src/interfaces/ecpg/preproc/ecpg ? src/interfaces/libpq/.deps ? src/interfaces/libpq/exports.list ? src/interfaces/libpq/libpq.so.5.3 ? src/interfaces/libpq/po/af.mo ? src/interfaces/libpq/po/hr.mo ? src/interfaces/libpq/po/nb.mo ? src/interfaces/libpq/po/pl.mo ? src/interfaces/libpq/po/sk.mo ? src/interfaces/libpq/po/sl.mo ? src/interfaces/libpq/po/zh_CN.mo ? src/interfaces/libpq/po/zh_TW.mo ? src/pl/plpgsql/src/.deps ? src/pl/plpgsql/src/pl_scan.c ? src/pl/plpgsql/src/po/tr.mo ? src/port/.deps ? src/port/pg_config_paths.h ? src/test/regress/.deps ? src/test/regress/log ? src/test/regress/pg_regress ? src/test/regress/results ? src/test/regress/testtablespace ? src/test/regress/expected/constraints.out ? src/test/regress/expected/copy.out ? src/test/regress/expected/create_function_1.out ? src/test/regress/expected/create_function_2.out ? src/test/regress/expected/largeobject.out ? src/test/regress/expected/largeobject_1.out ? src/test/regress/expected/misc.out ? src/test/regress/expected/tablespace.out ? src/test/regress/sql/constraints.sql ? src/test/regress/sql/copy.sql ? src/test/regress/sql/create_function_1.sql ? src/test/regress/sql/create_function_2.sql ? src/test/regress/sql/largeobject.sql ? src/test/regress/sql/misc.sql ? src/test/regress/sql/tablespace.sql ? src/timezone/.deps ? src/timezone/zic Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.375 diff -u -r1.375 xlog.c --- src/backend/access/transam/xlog.c 17 Feb 2010 04:19:39 -0000 1.375 +++ src/backend/access/transam/xlog.c 18 Feb 2010 12:40:05 -0000 @@ -2686,13 +2686,10 @@ * WAL segment files will not be re-read in normal operation, so we advise * the OS to release any cached pages. But do not do so if WAL archiving * or streaming is active, because archiver and walsender process could use - * the cache to read the WAL segment. Also, don't bother with it if we - * are using O_DIRECT, since the kernel is presumably not caching in that - * case. + * the cache to read the WAL segment. */ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - if (!XLogIsNeeded() && - (get_sync_bit(sync_method) & PG_O_DIRECT) == 0) + if (!XLogIsNeeded()) (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif @@ -7652,10 +7649,29 @@ static int get_sync_bit(int method) { + int o_direct_flag = 0; + /* If fsync is disabled, never open in sync mode */ if (!enableFsync) return 0; + /* + * Optimize writes by bypassing kernel cache with O_DIRECT when using + * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are + * disabled, otherwise the archive command or walsender process will + * read the WAL soon after writing it, which is guaranteed to cause a + * physical read if we bypassed the kernel cache. We also skip the + * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the + * same reason. + * + * Never use O_DIRECT in walsender process for similar reasons; the WAL + * written by walreceiver is normally read by the startup process soon + * after its written. Also, walreceiver performs unaligned writes, which + * don't work with O_DIRECT, so it is required for correctness too. + */ + if (!XLogIsNeeded() && !am_walreceiver) + o_direct_flag = PG_O_DIRECT; + switch (method) { /* @@ -7670,11 +7686,11 @@ return 0; #ifdef OPEN_SYNC_FLAG case SYNC_METHOD_OPEN: - return OPEN_SYNC_FLAG; + return OPEN_SYNC_FLAG | o_direct_flag; #endif #ifdef OPEN_DATASYNC_FLAG case SYNC_METHOD_OPEN_DSYNC: - return OPEN_DATASYNC_FLAG; + return OPEN_DATASYNC_FLAG | o_direct_flag; #endif default: /* can't happen (unless we are out of sync with option array) */ Index: src/backend/replication/walreceiver.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/replication/walreceiver.c,v retrieving revision 1.4 diff -u -r1.4 walreceiver.c --- src/backend/replication/walreceiver.c 17 Feb 2010 04:19:39 -0000 1.4 +++ src/backend/replication/walreceiver.c 18 Feb 2010 12:40:05 -0000 @@ -50,6 +50,9 @@ #include "utils/ps_status.h" #include "utils/resowner.h" +/* Global variable to indicate if this process is a walreceiver process */ +bool am_walreceiver; + /* libpqreceiver hooks to these when loaded */ walrcv_connect_type walrcv_connect = NULL; walrcv_receive_type walrcv_receive = NULL; @@ -158,6 +161,8 @@ /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; + am_walreceiver = true; + /* * WalRcv should be set up already (if we are a backend, we inherit * this by fork() or EXEC_BACKEND mechanism from the postmaster). @@ -424,16 +429,18 @@ bool use_existent; /* - * XLOG segment files will be re-read in recovery operation soon, - * so we don't need to advise the OS to release any cache page. + * fsync() and close current file before we switch to next one. + * We would otherwise have to reopen this file to fsync it later */ if (recvFile >= 0) { + XLogWalRcvFlush(); + /* - * fsync() before we switch to next file. We would otherwise - * have to reopen this file to fsync it later + * XLOG segment files will be re-read by recovery in startup + * process soon, so we don't advise the OS to release cache + * pages associated with the file like XLogFileClose() does. */ - XLogWalRcvFlush(); if (close(recvFile) != 0) ereport(PANIC, (errcode_for_file_access(), @@ -445,8 +452,7 @@ /* Create/use new log file */ XLByteToSeg(recptr, recvId, recvSeg); use_existent = true; - recvFile = XLogFileInit(recvId, recvSeg, - &use_existent, true); + recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true); recvOff = 0; } Index: src/include/access/xlogdefs.h =================================================================== RCS file: /cvsroot/pgsql/src/include/access/xlogdefs.h,v retrieving revision 1.25 diff -u -r1.25 xlogdefs.h --- src/include/access/xlogdefs.h 15 Jan 2010 09:19:06 -0000 1.25 +++ src/include/access/xlogdefs.h 18 Feb 2010 12:40:05 -0000 @@ -106,23 +106,20 @@ * configure determined whether fdatasync() is. */ #if defined(O_SYNC) -#define BARE_OPEN_SYNC_FLAG O_SYNC +#define OPEN_SYNC_FLAG O_SYNC #elif defined(O_FSYNC) -#define BARE_OPEN_SYNC_FLAG O_FSYNC -#endif -#ifdef BARE_OPEN_SYNC_FLAG -#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT) +#define OPEN_SYNC_FLAG O_FSYNC #endif #if defined(O_DSYNC) #if defined(OPEN_SYNC_FLAG) /* O_DSYNC is distinct? */ -#if O_DSYNC != BARE_OPEN_SYNC_FLAG -#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) +#if O_DSYNC != OPEN_SYNC_FLAG +#define OPEN_DATASYNC_FLAG O_DSYNC #endif #else /* !defined(OPEN_SYNC_FLAG) */ /* Win32 only has O_DSYNC */ -#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) +#define OPEN_DATASYNC_FLAG O_DSYNC #endif #endif Index: src/include/replication/walreceiver.h =================================================================== RCS file: /cvsroot/pgsql/src/include/replication/walreceiver.h,v retrieving revision 1.6 diff -u -r1.6 walreceiver.h --- src/include/replication/walreceiver.h 3 Feb 2010 09:47:19 -0000 1.6 +++ src/include/replication/walreceiver.h 18 Feb 2010 12:40:05 -0000 @@ -15,6 +15,8 @@ #include "access/xlogdefs.h" #include "storage/spin.h" +extern bool am_walreceiver; + /* * MAXCONNINFO: maximum size of a connection string. *
pgsql-hackers by date: