Fujii Masao wrote:
> Hi,
>
> When the replication connection is closed unexpectedly, walsender might emit
> the following unfit messages. IOW, the loss of the connection might be wrongly
> regarded as an arrival of invalid message by the walsender. This looks messy.
> We should get rid of that unfit FATAL message, emit a COMMERROR message and
> just call proc_exit() when the loss of the connection is found?
>
>> [2460]: LOG: could not receive data from client: No connection could be made because the target machine actively
refusedit.
>> [2460]: FATAL: invalid standby closing message type 4
>> [2460]: LOG: could not send data to client: No connection could be made because the target machine actively refused
it.
>
> Also the walsender wrongly tries to send the FATAL message to the standby even
> though the connection has already been closed, and then gets the following LOG
> message after the FATAL one. This FATAL message is suitable, but output of the
> LOG message looks messy, too. We should use COMMERROR instead of FATAL and then
> just call proc_exit() in order to prevent a message from being sent?
>
>> [12586] FATAL: unexpected EOF on standby connection
>> [12586] LOG: could not send data to client: Broken pipe
>
> The attached patch fixes those unfit messages.
Actually the pg_getbyte_if_available() function is a bit confused. The
comment above it claims that it returns 0 if no data is available
without blocking, but it actually returns -1 with errno==EWOULDBLOCK.
That stems from confusion in secure_read(); with SSL it returns 0 if it
would block, but without SSL it returns -1 and EWOULDBLOCK. We should
fix that so that secure_read() behaves consistently and so that
pq_getbyte_if_available() behaves like e.g pq_getbytes().
Patch attached for that. pq_getbyte_if_available() now reports any
errors with COMMERROR level, and returns EOF if the connection is closed
cleanly. If no data is available without blocking it now really returns
0 as the comment said. Walsender reports any unexpected EOF to the log
at COMMERROR level, similar to what normal backends do.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
? aaa.patch
? postgres
? access/common/.deps
? access/gin/.deps
? access/gist/.deps
? access/hash/.deps
? access/heap/.deps
? access/index/.deps
? access/nbtree/.deps
? access/transam/.deps
? bootstrap/.deps
? catalog/.deps
? commands/.deps
? executor/.deps
? foreign/.deps
? foreign/dummy/.deps
? foreign/postgresql/.deps
? lib/.deps
? libpq/.deps
? main/.deps
? nodes/.deps
? optimizer/geqo/.deps
? optimizer/path/.deps
? optimizer/plan/.deps
? optimizer/prep/.deps
? optimizer/util/.deps
? parser/.deps
? po/af.mo
? po/cs.mo
? po/hr.mo
? po/hu.mo
? po/it.mo
? po/ko.mo
? po/nb.mo
? po/nl.mo
? po/pl.mo
? po/ro.mo
? po/ru.mo
? po/sk.mo
? po/sl.mo
? po/sv.mo
? po/zh_CN.mo
? po/zh_TW.mo
? port/.deps
? postmaster/.deps
? regex/.deps
? replication/.deps
? replication/libpqwalreceiver/.deps
? rewrite/.deps
? snowball/.deps
? snowball/snowball_create.sql
? storage/buffer/.deps
? storage/file/.deps
? storage/freespace/.deps
? storage/ipc/.deps
? storage/large_object/.deps
? storage/lmgr/.deps
? storage/page/.deps
? storage/smgr/.deps
? tcop/.deps
? tsearch/.deps
? utils/.deps
? utils/probes.h
? utils/adt/.deps
? utils/cache/.deps
? utils/error/.deps
? utils/fmgr/.deps
? utils/hash/.deps
? utils/init/.deps
? utils/mb/.deps
? utils/mb/Unicode/BIG5.TXT
? utils/mb/Unicode/CP950.TXT
? utils/mb/conversion_procs/conversion_create.sql
? utils/mb/conversion_procs/ascii_and_mic/.deps
? utils/mb/conversion_procs/cyrillic_and_mic/.deps
? utils/mb/conversion_procs/euc2004_sjis2004/.deps
? utils/mb/conversion_procs/euc_cn_and_mic/.deps
? utils/mb/conversion_procs/euc_jis_2004_and_shift_jis_2004/.deps
? utils/mb/conversion_procs/euc_jp_and_sjis/.deps
? utils/mb/conversion_procs/euc_kr_and_mic/.deps
? utils/mb/conversion_procs/euc_tw_and_big5/.deps
? utils/mb/conversion_procs/latin2_and_win1250/.deps
? utils/mb/conversion_procs/latin_and_mic/.deps
? utils/mb/conversion_procs/utf8_and_ascii/.deps
? utils/mb/conversion_procs/utf8_and_big5/.deps
? utils/mb/conversion_procs/utf8_and_cyrillic/.deps
? utils/mb/conversion_procs/utf8_and_euc2004/.deps
? utils/mb/conversion_procs/utf8_and_euc_cn/.deps
? utils/mb/conversion_procs/utf8_and_euc_jis_2004/.deps
? utils/mb/conversion_procs/utf8_and_euc_jp/.deps
? utils/mb/conversion_procs/utf8_and_euc_kr/.deps
? utils/mb/conversion_procs/utf8_and_euc_tw/.deps
? utils/mb/conversion_procs/utf8_and_gb18030/.deps
? utils/mb/conversion_procs/utf8_and_gbk/.deps
? utils/mb/conversion_procs/utf8_and_iso8859/.deps
? utils/mb/conversion_procs/utf8_and_iso8859_1/.deps
? utils/mb/conversion_procs/utf8_and_johab/.deps
? utils/mb/conversion_procs/utf8_and_shift_jis_2004/.deps
? utils/mb/conversion_procs/utf8_and_sjis/.deps
? utils/mb/conversion_procs/utf8_and_sjis2004/.deps
? utils/mb/conversion_procs/utf8_and_uhc/.deps
? utils/mb/conversion_procs/utf8_and_win/.deps
? utils/misc/.deps
? utils/mmgr/.deps
? utils/resowner/.deps
? utils/sort/.deps
? utils/time/.deps
Index: libpq/be-secure.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/libpq/be-secure.c,v
retrieving revision 1.96
diff -c -r1.96 be-secure.c
*** libpq/be-secure.c 15 Jan 2010 09:19:02 -0000 1.96
--- libpq/be-secure.c 18 Feb 2010 09:42:30 -0000
***************
*** 256,262 ****
case SSL_ERROR_WANT_READ:
case SSL_ERROR_WANT_WRITE:
if (port->noblock)
! return 0;
#ifdef WIN32
pgwin32_waitforsinglesocket(SSL_get_fd(port->ssl),
(err == SSL_ERROR_WANT_READ) ?
--- 256,266 ----
case SSL_ERROR_WANT_READ:
case SSL_ERROR_WANT_WRITE:
if (port->noblock)
! {
! errno = EWOULDBLOCK;
! n = -1;
! break;
! }
#ifdef WIN32
pgwin32_waitforsinglesocket(SSL_get_fd(port->ssl),
(err == SSL_ERROR_WANT_READ) ?
Index: libpq/pqcomm.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/libpq/pqcomm.c,v
retrieving revision 1.203
diff -c -r1.203 pqcomm.c
*** libpq/pqcomm.c 16 Feb 2010 19:26:02 -0000 1.203
--- libpq/pqcomm.c 18 Feb 2010 09:42:30 -0000
***************
*** 822,828 ****
* if available
*
* The received byte is stored in *c. Returns 1 if a byte was read, 0 if
! * if no data was available, or EOF.
* --------------------------------
*/
int
--- 822,828 ----
* if available
*
* The received byte is stored in *c. Returns 1 if a byte was read, 0 if
! * if no data was available, or EOF if trouble.
* --------------------------------
*/
int
***************
*** 848,853 ****
--- 848,879 ----
PG_TRY();
{
r = secure_read(MyProcPort, c, 1);
+ if (r < 0)
+ {
+ /*
+ * Ok if would block or interrupted (though it really shouldn't
+ * happen with a non-blocking operation).
+ */
+ if (errno == EAGAIN || EWOULDBLOCK || errno == EINTR)
+ r = 0;
+ else
+ {
+ /*
+ * Careful: an ereport() that tries to write to the client would
+ * cause recursion to here, leading to stack overflow and core
+ * dump! This message must go *only* to the postmaster log.
+ */
+ ereport(COMMERROR,
+ (errcode_for_socket_access(),
+ errmsg("could not receive data from client: %m")));
+ return EOF;
+ }
+ }
+ else if (r == 0)
+ {
+ /* EOF detected */
+ r = EOF;
+ }
}
PG_CATCH();
{
Index: replication/walsender.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/replication/walsender.c,v
retrieving revision 1.6
diff -c -r1.6 walsender.c
*** replication/walsender.c 17 Feb 2010 04:19:39 -0000 1.6
--- replication/walsender.c 18 Feb 2010 09:42:30 -0000
***************
*** 286,300 ****
break;
}
! /* 'X' means that the standby is closing the connection */
case 'X':
proc_exit(0);
- case EOF:
- ereport(ERROR,
- (errcode(ERRCODE_PROTOCOL_VIOLATION),
- errmsg("unexpected EOF on standby connection")));
-
default:
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
--- 286,304 ----
break;
}
! /*
! * 'X' means that the standby is closing the connection. EOF
! * means unexpected loss of standby connection. Either way,
! * perform normal shutdown.
! */
! case EOF:
! ereport(COMMERROR,
! (errcode(ERRCODE_PROTOCOL_VIOLATION),
! errmsg("unexpected EOF on standby connection")));
! /* fall through */
case 'X':
proc_exit(0);
default:
ereport(FATAL,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
***************
*** 315,350 ****
r = pq_getbyte_if_available(&firstchar);
if (r < 0)
{
! /* no data available */
! if (errno == EAGAIN || errno == EWOULDBLOCK)
! return;
!
! /*
! * Ok if interrupted, though it shouldn't really happen with
! * a non-blocking operation.
! */
! if (errno == EINTR)
! return;
!
ereport(COMMERROR,
! (errcode_for_socket_access(),
! errmsg("could not receive data from client: %m")));
}
if (r == 0)
{
! /* standby disconnected unexpectedly */
! ereport(ERROR,
! (errcode(ERRCODE_PROTOCOL_VIOLATION),
! errmsg("unexpected EOF on standby connection")));
}
/* Handle the very limited subset of commands expected in this phase */
switch (firstchar)
{
/*
! * 'X' means that the standby is closing down the socket. EOF means
! * unexpected loss of standby connection. Either way, perform normal
! * shutdown.
*/
case 'X':
proc_exit(0);
--- 319,341 ----
r = pq_getbyte_if_available(&firstchar);
if (r < 0)
{
! /* unexpected error */
ereport(COMMERROR,
! (errcode(ERRCODE_PROTOCOL_VIOLATION),
! errmsg("unexpected EOF on standby connection")));
! proc_exit(0);
}
if (r == 0)
{
! /* no data available without blocking */
! return;
}
/* Handle the very limited subset of commands expected in this phase */
switch (firstchar)
{
/*
! * 'X' means that the standby is closing down the socket.
*/
case 'X':
proc_exit(0);