Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * walsender.c
4 : *
5 : * The WAL sender process (walsender) is new as of Postgres 9.0. It takes
6 : * care of sending XLOG from the primary server to a single recipient.
7 : * (Note that there can be more than one walsender process concurrently.)
8 : * It is started by the postmaster when the walreceiver of a standby server
9 : * connects to the primary server and requests XLOG streaming replication.
10 : *
11 : * A walsender is similar to a regular backend, ie. there is a one-to-one
12 : * relationship between a connection and a walsender process, but instead
13 : * of processing SQL queries, it understands a small set of special
14 : * replication-mode commands. The START_REPLICATION command begins streaming
15 : * WAL to the client. While streaming, the walsender keeps reading XLOG
16 : * records from the disk and sends them to the standby server over the
17 : * COPY protocol, until either side ends the replication by exiting COPY
18 : * mode (or until the connection is closed).
19 : *
20 : * Normal termination is by SIGTERM, which instructs the walsender to
21 : * close the connection and exit(0) at the next convenient moment. Emergency
22 : * termination is by SIGQUIT; like any backend, the walsender will simply
23 : * abort and exit on SIGQUIT. A close of the connection and a FATAL error
24 : * are treated as not a crash but approximately normal termination;
25 : * the walsender will exit quickly without sending any more XLOG records.
26 : *
27 : * If the server is shut down, checkpointer sends us
28 : * PROCSIG_WALSND_INIT_STOPPING after all regular backends have exited. If
29 : * the backend is idle or is running an SQL query, this causes the backend
30 : * to shut down; if logical replication is in progress, all existing WAL
31 : * records are processed, followed by a shutdown. Otherwise the walsender
32 : * switches to the "stopping" state. In this state, the walsender will reject
33 : * any further replication commands. The checkpointer begins the shutdown
34 : * checkpoint once all walsenders are confirmed as stopping. When the shutdown
35 : * checkpoint finishes, the postmaster sends us SIGUSR2. This instructs
36 : * walsender to send any outstanding WAL, including the shutdown checkpoint
37 : * record, wait for it to be replicated to the standby, and then exit.
38 : *
39 : *
40 : * Portions Copyright (c) 2010-2020, PostgreSQL Global Development Group
41 : *
42 : * IDENTIFICATION
43 : * src/backend/replication/walsender.c
44 : *
45 : *-------------------------------------------------------------------------
46 : */
47 : #include "postgres.h"
48 :
49 : #include <signal.h>
50 : #include <unistd.h>
51 :
52 : #include "access/printtup.h"
53 : #include "access/timeline.h"
54 : #include "access/transam.h"
55 : #include "access/xact.h"
56 : #include "access/xlog_internal.h"
57 : #include "access/xlogreader.h"
58 : #include "access/xlogutils.h"
59 : #include "catalog/pg_authid.h"
60 : #include "catalog/pg_type.h"
61 : #include "commands/dbcommands.h"
62 : #include "commands/defrem.h"
63 : #include "funcapi.h"
64 : #include "libpq/libpq.h"
65 : #include "libpq/pqformat.h"
66 : #include "miscadmin.h"
67 : #include "nodes/replnodes.h"
68 : #include "pgstat.h"
69 : #include "postmaster/interrupt.h"
70 : #include "replication/basebackup.h"
71 : #include "replication/decode.h"
72 : #include "replication/logical.h"
73 : #include "replication/slot.h"
74 : #include "replication/snapbuild.h"
75 : #include "replication/syncrep.h"
76 : #include "replication/walreceiver.h"
77 : #include "replication/walsender.h"
78 : #include "replication/walsender_private.h"
79 : #include "storage/condition_variable.h"
80 : #include "storage/fd.h"
81 : #include "storage/ipc.h"
82 : #include "storage/pmsignal.h"
83 : #include "storage/proc.h"
84 : #include "storage/procarray.h"
85 : #include "tcop/dest.h"
86 : #include "tcop/tcopprot.h"
87 : #include "utils/acl.h"
88 : #include "utils/builtins.h"
89 : #include "utils/guc.h"
90 : #include "utils/memutils.h"
91 : #include "utils/pg_lsn.h"
92 : #include "utils/portal.h"
93 : #include "utils/ps_status.h"
94 : #include "utils/timeout.h"
95 : #include "utils/timestamp.h"
96 :
97 : /*
98 : * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ.
99 : *
100 : * We don't have a good idea of what a good value would be; there's some
101 : * overhead per message in both walsender and walreceiver, but on the other
102 : * hand sending large batches makes walsender less responsive to signals
103 : * because signals are checked only between messages. 128kB (with
104 : * default 8k blocks) seems like a reasonable guess for now.
105 : */
106 : #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)
107 :
108 : /* Array of WalSnds in shared memory */
109 : WalSndCtlData *WalSndCtl = NULL;
110 :
111 : /* My slot in the shared memory array */
112 : WalSnd *MyWalSnd = NULL;
113 :
114 : /* Global state */
115 : bool am_walsender = false; /* Am I a walsender process? */
116 : bool am_cascading_walsender = false; /* Am I cascading WAL to another
117 : * standby? */
118 : bool am_db_walsender = false; /* Connected to a database? */
119 :
120 : /* User-settable parameters for walsender */
121 : int max_wal_senders = 0; /* the maximum number of concurrent
122 : * walsenders */
123 : int wal_sender_timeout = 60 * 1000; /* maximum time to send one WAL
124 : * data message */
125 : bool log_replication_commands = false;
126 :
127 : /*
128 : * State for WalSndWakeupRequest
129 : */
130 : bool wake_wal_senders = false;
131 :
132 : /*
133 : * xlogreader used for replication. Note that a WAL sender doing physical
134 : * replication does not need xlogreader to read WAL, but it needs one to
135 : * keep a state of its work.
136 : */
137 : static XLogReaderState *xlogreader = NULL;
138 :
139 : /*
140 : * These variables keep track of the state of the timeline we're currently
141 : * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
142 : * the timeline is not the latest timeline on this server, and the server's
143 : * history forked off from that timeline at sendTimeLineValidUpto.
144 : */
145 : static TimeLineID sendTimeLine = 0;
146 : static TimeLineID sendTimeLineNextTLI = 0;
147 : static bool sendTimeLineIsHistoric = false;
148 : static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
149 :
150 : /*
151 : * How far have we sent WAL already? This is also advertised in
152 : * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.)
153 : */
154 : static XLogRecPtr sentPtr = InvalidXLogRecPtr;
155 :
156 : /* Buffers for constructing outgoing messages and processing reply messages. */
157 : static StringInfoData output_message;
158 : static StringInfoData reply_message;
159 : static StringInfoData tmpbuf;
160 :
161 : /* Timestamp of last ProcessRepliesIfAny(). */
162 : static TimestampTz last_processing = 0;
163 :
164 : /*
165 : * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
166 : * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
167 : */
168 : static TimestampTz last_reply_timestamp = 0;
169 :
170 : /* Have we sent a heartbeat message asking for reply, since last reply? */
171 : static bool waiting_for_ping_response = false;
172 :
173 : /*
174 : * While streaming WAL in Copy mode, streamingDoneSending is set to true
175 : * after we have sent CopyDone. We should not send any more CopyData messages
176 : * after that. streamingDoneReceiving is set to true when we receive CopyDone
177 : * from the other end. When both become true, it's time to exit Copy mode.
178 : */
179 : static bool streamingDoneSending;
180 : static bool streamingDoneReceiving;
181 :
182 : /* Are we there yet? */
183 : static bool WalSndCaughtUp = false;
184 :
185 : /* Flags set by signal handlers for later service in main loop */
186 : static volatile sig_atomic_t got_SIGUSR2 = false;
187 : static volatile sig_atomic_t got_STOPPING = false;
188 :
189 : /*
190 : * This is set while we are streaming. When not set
191 : * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
192 : * the main loop is responsible for checking got_STOPPING and terminating when
193 : * it's set (after streaming any remaining WAL).
194 : */
195 : static volatile sig_atomic_t replication_active = false;
196 :
197 : static LogicalDecodingContext *logical_decoding_ctx = NULL;
198 :
199 : /* A sample associating a WAL location with the time it was written. */
200 : typedef struct
201 : {
202 : XLogRecPtr lsn; /* the WAL location of this sample */
203 : TimestampTz time; /* the time associated with that location */
204 : } WalTimeSample;
205 :
206 : /* The size of our buffer of time samples. */
207 : #define LAG_TRACKER_BUFFER_SIZE 8192
208 :
209 : /* A mechanism for tracking replication lag. */
210 : typedef struct
211 : {
212 : XLogRecPtr last_lsn; /* presumably the most recently recorded LSN - TODO confirm in LagTrackerWrite */
213 : WalTimeSample buffer[LAG_TRACKER_BUFFER_SIZE]; /* fixed-size array of (lsn, time) samples */
214 : int write_head; /* index into buffer; presumably the next slot to write - TODO confirm */
215 : int read_heads[NUM_SYNC_REP_WAIT_MODE]; /* one buffer index per sync-rep wait mode */
216 : WalTimeSample last_read[NUM_SYNC_REP_WAIT_MODE]; /* one sample per sync-rep wait mode; presumably the last one consumed - TODO confirm */
217 : } LagTracker;
218 :
219 : static LagTracker *lag_tracker;
220 :
221 : /* Signal handlers */
222 : static void WalSndLastCycleHandler(SIGNAL_ARGS);
223 :
224 : /* Prototypes for private functions */
225 : typedef void (*WalSndSendDataCallback) (void);
226 : static void WalSndLoop(WalSndSendDataCallback send_data);
227 : static void InitWalSenderSlot(void);
228 : static void WalSndKill(int code, Datum arg);
229 : static void WalSndShutdown(void) pg_attribute_noreturn();
230 : static void XLogSendPhysical(void);
231 : static void XLogSendLogical(void);
232 : static void WalSndDone(WalSndSendDataCallback send_data);
233 : static XLogRecPtr GetStandbyFlushRecPtr(void);
234 : static void IdentifySystem(void);
235 : static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd);
236 : static void DropReplicationSlot(DropReplicationSlotCmd *cmd);
237 : static void StartReplication(StartReplicationCmd *cmd);
238 : static void StartLogicalReplication(StartReplicationCmd *cmd);
239 : static void ProcessStandbyMessage(void);
240 : static void ProcessStandbyReplyMessage(void);
241 : static void ProcessStandbyHSFeedbackMessage(void);
242 : static void ProcessRepliesIfAny(void);
243 : static void WalSndKeepalive(bool requestReply);
244 : static void WalSndKeepaliveIfNecessary(void);
245 : static void WalSndCheckTimeOut(void);
246 : static long WalSndComputeSleeptime(TimestampTz now);
247 : static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
248 : static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
249 : static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid);
250 : static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
251 : static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time);
252 : static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now);
253 : static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch);
254 :
255 : static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
256 : TimeLineID *tli_p);
257 :
258 :
259 : /* Initialize walsender process before entering the main command loop */
260 : void
261 288 : InitWalSender(void)
262 : {
263 288 : am_cascading_walsender = RecoveryInProgress();
264 :
265 : /* Create a per-walsender data structure in shared memory */
266 288 : InitWalSenderSlot();
267 :
268 : /*
269 : * We don't currently need any ResourceOwner in a walsender process, but
270 : * if we did, we could call CreateAuxProcessResourceOwner here.
271 : */
272 :
273 : /*
274 : * Let postmaster know that we're a WAL sender. Once we've declared us as
275 : * a WAL sender process, postmaster will let us outlive the bgwriter and
276 : * kill us last in the shutdown sequence, so we get a chance to stream all
277 : * remaining WAL at shutdown, including the shutdown checkpoint. Note that
278 : * there's no going back, and we mustn't write any WAL records after this.
279 : */
280 288 : MarkPostmasterChildWalSender();
281 288 : SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);
282 :
283 : /* Initialize empty timestamp buffer for lag tracking. */
284 288 : lag_tracker = MemoryContextAllocZero(TopMemoryContext, sizeof(LagTracker));
285 288 : }
286 :
287 : /*
288 : * Clean up after an error.
289 : *
290 : * WAL sender processes don't use transactions like regular backends do.
291 : * This function does any cleanup required after an error in a WAL sender
292 : * process, similar to what transaction abort does in a regular backend.
293 : */
294 : void
295 0 : WalSndErrorCleanup(void)
296 : {
297 0 : LWLockReleaseAll();
298 0 : ConditionVariableCancelSleep();
299 0 : pgstat_report_wait_end();
300 :
301 0 : if (xlogreader != NULL && xlogreader->seg.ws_file >= 0)
302 0 : wal_segment_close(xlogreader);
303 :
304 0 : if (MyReplicationSlot != NULL)
305 0 : ReplicationSlotRelease();
306 :
307 0 : ReplicationSlotCleanup();
308 :
309 0 : replication_active = false;
310 :
311 : /*
312 : * If there is a transaction in progress, it will clean up our
313 : * ResourceOwner, but if a replication command set up a resource owner
314 : * without a transaction, we've got to clean that up now.
315 : */
316 0 : if (!IsTransactionOrTransactionBlock())
317 0 : WalSndResourceCleanup(false);
318 :
319 0 : if (got_STOPPING || got_SIGUSR2)
320 0 : proc_exit(0);
321 :
322 : /* Revert back to startup state */
323 0 : WalSndSetState(WALSNDSTATE_STARTUP);
324 0 : }
325 :
326 : /*
327 : * Clean up any ResourceOwner we created.
328 : */
329 : void
330 0 : WalSndResourceCleanup(bool isCommit)
331 : {
332 : ResourceOwner resowner;
333 :
334 0 : if (CurrentResourceOwner == NULL)
335 0 : return;
336 :
337 : /*
338 : * Deleting CurrentResourceOwner is not allowed, so we must save a pointer
339 : * in a local variable and clear it first.
340 : */
341 0 : resowner = CurrentResourceOwner;
342 0 : CurrentResourceOwner = NULL;
343 :
344 : /* Now we can release resources and delete it. */
345 0 : ResourceOwnerRelease(resowner,
346 : RESOURCE_RELEASE_BEFORE_LOCKS, isCommit, true);
347 0 : ResourceOwnerRelease(resowner,
348 : RESOURCE_RELEASE_LOCKS, isCommit, true);
349 0 : ResourceOwnerRelease(resowner,
350 : RESOURCE_RELEASE_AFTER_LOCKS, isCommit, true);
351 0 : ResourceOwnerDelete(resowner);
352 : }
353 :
354 : /*
355 : * Handle a client's connection abort in an orderly manner.
356 : */
357 : static void
358 6 : WalSndShutdown(void)
359 : {
360 : /*
361 : * Reset whereToSendOutput to prevent ereport from attempting to send any
362 : * more messages to the standby.
363 : */
364 6 : if (whereToSendOutput == DestRemote)
365 6 : whereToSendOutput = DestNone;
366 :
367 6 : proc_exit(0);
368 : abort(); /* keep the compiler quiet */
369 : }
370 :
371 : /*
372 : * Handle the IDENTIFY_SYSTEM command.
373 : */
374 : static void
375 74 : IdentifySystem(void)
376 : {
377 : char sysid[32];
378 : char xloc[MAXFNAMELEN];
379 : XLogRecPtr logptr;
380 74 : char *dbname = NULL;
381 : DestReceiver *dest;
382 : TupOutputState *tstate;
383 : TupleDesc tupdesc;
384 : Datum values[4];
385 : bool nulls[4];
386 :
387 : /*
388 : * Reply with a result set with one row, four columns. First col is system
389 : * ID, second is timeline ID, third is current xlog location and the
390 : * fourth contains the database name if we are connected to one.
391 : */
392 :
393 74 : snprintf(sysid, sizeof(sysid), UINT64_FORMAT,
394 : GetSystemIdentifier());
395 :
396 74 : am_cascading_walsender = RecoveryInProgress();
397 74 : if (am_cascading_walsender)
398 : {
399 : /* this also updates ThisTimeLineID */
400 0 : logptr = GetStandbyFlushRecPtr();
401 : }
402 : else
403 74 : logptr = GetFlushRecPtr();
404 :
405 74 : snprintf(xloc, sizeof(xloc), "%X/%X", (uint32) (logptr >> 32), (uint32) logptr);
406 :
407 74 : if (MyDatabaseId != InvalidOid)
408 : {
409 74 : MemoryContext cur = CurrentMemoryContext;
410 :
411 : /* syscache access needs a transaction env. */
412 74 : StartTransactionCommand();
413 : /* make dbname live outside TX context */
414 74 : MemoryContextSwitchTo(cur);
415 74 : dbname = get_database_name(MyDatabaseId);
416 74 : CommitTransactionCommand();
417 : /* CommitTransactionCommand switches to TopMemoryContext */
418 74 : MemoryContextSwitchTo(cur);
419 : }
420 :
421 74 : dest = CreateDestReceiver(DestRemoteSimple);
422 74 : MemSet(nulls, false, sizeof(nulls));
423 :
424 : /* need a tuple descriptor representing four columns */
425 74 : tupdesc = CreateTemplateTupleDesc(4);
426 74 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "systemid",
427 : TEXTOID, -1, 0);
428 74 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "timeline",
429 : INT4OID, -1, 0);
430 74 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 3, "xlogpos",
431 : TEXTOID, -1, 0);
432 74 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 4, "dbname",
433 : TEXTOID, -1, 0);
434 :
435 : /* prepare for projection of tuples */
436 74 : tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
437 :
438 : /* column 1: system identifier */
439 74 : values[0] = CStringGetTextDatum(sysid);
440 :
441 : /* column 2: timeline */
442 74 : values[1] = Int32GetDatum(ThisTimeLineID);
443 :
444 : /* column 3: wal location */
445 74 : values[2] = CStringGetTextDatum(xloc);
446 :
447 : /* column 4: database name, or NULL if none */
448 74 : if (dbname)
449 74 : values[3] = CStringGetTextDatum(dbname);
450 : else
451 0 : nulls[3] = true;
452 :
453 : /* send it to dest */
454 74 : do_tup_output(tstate, values, nulls);
455 :
456 74 : end_tup_output(tstate);
457 74 : }
458 :
459 :
460 : /*
461 : * Handle TIMELINE_HISTORY command.
462 : */
463 : static void
464 0 : SendTimeLineHistory(TimeLineHistoryCmd *cmd)
465 : {
466 : StringInfoData buf;
467 : char histfname[MAXFNAMELEN];
468 : char path[MAXPGPATH];
469 : int fd;
470 : off_t histfilelen;
471 : off_t bytesleft;
472 : Size len;
473 :
474 : /*
475 : * Reply with a result set with one row, and two columns. The first col is
476 : * the name of the history file, 2nd is the contents.
477 : */
478 :
479 0 : TLHistoryFileName(histfname, cmd->timeline);
480 0 : TLHistoryFilePath(path, cmd->timeline);
481 :
482 : /* Send a RowDescription message */
483 0 : pq_beginmessage(&buf, 'T');
484 0 : pq_sendint16(&buf, 2); /* 2 fields */
485 :
486 : /* first field */
487 0 : pq_sendstring(&buf, "filename"); /* col name */
488 0 : pq_sendint32(&buf, 0); /* table oid */
489 0 : pq_sendint16(&buf, 0); /* attnum */
490 0 : pq_sendint32(&buf, TEXTOID); /* type oid */
491 0 : pq_sendint16(&buf, -1); /* typlen */
492 0 : pq_sendint32(&buf, 0); /* typmod */
493 0 : pq_sendint16(&buf, 0); /* format code */
494 :
495 : /* second field */
496 0 : pq_sendstring(&buf, "content"); /* col name */
497 0 : pq_sendint32(&buf, 0); /* table oid */
498 0 : pq_sendint16(&buf, 0); /* attnum */
499 0 : pq_sendint32(&buf, BYTEAOID); /* type oid */
500 0 : pq_sendint16(&buf, -1); /* typlen */
501 0 : pq_sendint32(&buf, 0); /* typmod */
502 0 : pq_sendint16(&buf, 0); /* format code */
503 0 : pq_endmessage(&buf);
504 :
505 : /* Send a DataRow message */
506 0 : pq_beginmessage(&buf, 'D');
507 0 : pq_sendint16(&buf, 2); /* # of columns */
508 0 : len = strlen(histfname);
509 0 : pq_sendint32(&buf, len); /* col1 len */
510 0 : pq_sendbytes(&buf, histfname, len);
511 :
512 0 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
513 0 : if (fd < 0)
514 0 : ereport(ERROR,
515 : (errcode_for_file_access(),
516 : errmsg("could not open file \"%s\": %m", path)));
517 :
518 : /* Determine file length and send it to client */
519 0 : histfilelen = lseek(fd, 0, SEEK_END);
520 0 : if (histfilelen < 0)
521 0 : ereport(ERROR,
522 : (errcode_for_file_access(),
523 : errmsg("could not seek to end of file \"%s\": %m", path)));
524 0 : if (lseek(fd, 0, SEEK_SET) != 0)
525 0 : ereport(ERROR,
526 : (errcode_for_file_access(),
527 : errmsg("could not seek to beginning of file \"%s\": %m", path)));
528 :
529 0 : pq_sendint32(&buf, histfilelen); /* col2 len */
530 :
531 0 : bytesleft = histfilelen;
532 0 : while (bytesleft > 0)
533 : {
534 : PGAlignedBlock rbuf;
535 : int nread;
536 :
537 0 : pgstat_report_wait_start(WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ);
538 0 : nread = read(fd, rbuf.data, sizeof(rbuf));
539 0 : pgstat_report_wait_end();
540 0 : if (nread < 0)
541 0 : ereport(ERROR,
542 : (errcode_for_file_access(),
543 : errmsg("could not read file \"%s\": %m",
544 : path)));
545 0 : else if (nread == 0)
546 0 : ereport(ERROR,
547 : (errcode(ERRCODE_DATA_CORRUPTED),
548 : errmsg("could not read file \"%s\": read %d of %zu",
549 : path, nread, (Size) bytesleft)));
550 :
551 0 : pq_sendbytes(&buf, rbuf.data, nread);
552 0 : bytesleft -= nread;
553 : }
554 :
555 0 : if (CloseTransientFile(fd) != 0)
556 0 : ereport(ERROR,
557 : (errcode_for_file_access(),
558 : errmsg("could not close file \"%s\": %m", path)));
559 :
560 0 : pq_endmessage(&buf);
561 0 : }
562 :
563 : /*
564 : * Handle START_REPLICATION command.
565 : *
566 : * At the moment, this never returns, but an ereport(ERROR) will take us back
567 : * to the main loop.
568 : */
569 : static void
570 0 : StartReplication(StartReplicationCmd *cmd)
571 : {
572 : StringInfoData buf;
573 : XLogRecPtr FlushPtr;
574 :
575 0 : if (ThisTimeLineID == 0)
576 0 : ereport(ERROR,
577 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
578 : errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
579 :
580 : /* create xlogreader for physical replication */
581 0 : xlogreader =
582 0 : XLogReaderAllocate(wal_segment_size, NULL,
583 0 : XL_ROUTINE(.segment_open = WalSndSegmentOpen,
584 : .segment_close = wal_segment_close),
585 : NULL);
586 :
587 0 : if (!xlogreader)
588 0 : ereport(ERROR,
589 : (errcode(ERRCODE_OUT_OF_MEMORY),
590 : errmsg("out of memory")));
591 :
592 : /*
593 : * We assume here that we're logging enough information in the WAL for
594 : * log-shipping, since this is checked in PostmasterMain().
595 : *
596 : * NOTE: wal_level can only change at shutdown, so in most cases it is
597 : * difficult for there to be WAL data that we can still see that was
598 : * written at wal_level='minimal'.
599 : */
600 :
601 0 : if (cmd->slotname)
602 : {
603 0 : (void) ReplicationSlotAcquire(cmd->slotname, SAB_Error);
604 0 : if (SlotIsLogical(MyReplicationSlot))
605 0 : ereport(ERROR,
606 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
607 : errmsg("cannot use a logical replication slot for physical replication")));
608 :
609 : /*
610 : * We don't need to verify the slot's restart_lsn here; instead we
611 : * rely on the caller requesting the starting point to use. If the
612 : * WAL segment doesn't exist, we'll fail later.
613 : */
614 : }
615 :
616 : /*
617 : * Select the timeline. If it was given explicitly by the client, use
618 : * that. Otherwise use the timeline of the last replayed record, which is
619 : * kept in ThisTimeLineID.
620 : */
621 0 : if (am_cascading_walsender)
622 : {
623 : /* this also updates ThisTimeLineID */
624 0 : FlushPtr = GetStandbyFlushRecPtr();
625 : }
626 : else
627 0 : FlushPtr = GetFlushRecPtr();
628 :
629 0 : if (cmd->timeline != 0)
630 : {
631 : XLogRecPtr switchpoint;
632 :
633 0 : sendTimeLine = cmd->timeline;
634 0 : if (sendTimeLine == ThisTimeLineID)
635 : {
636 0 : sendTimeLineIsHistoric = false;
637 0 : sendTimeLineValidUpto = InvalidXLogRecPtr;
638 : }
639 : else
640 : {
641 : List *timeLineHistory;
642 :
643 0 : sendTimeLineIsHistoric = true;
644 :
645 : /*
646 : * Check that the timeline the client requested exists, and the
647 : * requested start location is on that timeline.
648 : */
649 0 : timeLineHistory = readTimeLineHistory(ThisTimeLineID);
650 0 : switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
651 : &sendTimeLineNextTLI);
652 0 : list_free_deep(timeLineHistory);
653 :
654 : /*
655 : * Found the requested timeline in the history. Check that
656 : * requested startpoint is on that timeline in our history.
657 : *
658 : * This is quite loose on purpose. We only check that we didn't
659 : * fork off the requested timeline before the switchpoint. We
660 : * don't check that we switched *to* it before the requested
661 : * starting point. This is because the client can legitimately
662 : * request to start replication from the beginning of the WAL
663 : * segment that contains switchpoint, but on the new timeline, so
664 : * that it doesn't end up with a partial segment. If you ask for
665 : * too old a starting point, you'll get an error later when we
666 : * fail to find the requested WAL segment in pg_wal.
667 : *
668 : * XXX: we could be more strict here and only allow a startpoint
669 : * that's older than the switchpoint, if it's still in the same
670 : * WAL segment.
671 : */
672 0 : if (!XLogRecPtrIsInvalid(switchpoint) &&
673 0 : switchpoint < cmd->startpoint)
674 : {
675 0 : ereport(ERROR,
676 : (errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
677 : (uint32) (cmd->startpoint >> 32),
678 : (uint32) (cmd->startpoint),
679 : cmd->timeline),
680 : errdetail("This server's history forked from timeline %u at %X/%X.",
681 : cmd->timeline,
682 : (uint32) (switchpoint >> 32),
683 : (uint32) (switchpoint))));
684 : }
685 0 : sendTimeLineValidUpto = switchpoint;
686 : }
687 : }
688 : else
689 : {
690 0 : sendTimeLine = ThisTimeLineID;
691 0 : sendTimeLineValidUpto = InvalidXLogRecPtr;
692 0 : sendTimeLineIsHistoric = false;
693 : }
694 :
695 0 : streamingDoneSending = streamingDoneReceiving = false;
696 :
697 : /* If there is nothing to stream, don't even enter COPY mode */
698 0 : if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
699 : {
700 : /*
701 : * When we first start replication the standby will be behind the
702 : * primary. For some applications, for example synchronous
703 : * replication, it is important to have a clear state for this initial
704 : * catchup mode, so we can trigger actions when we change streaming
705 : * state later. We may stay in this state for a long time, which is
706 : * exactly why we want to be able to monitor whether or not we are
707 : * still here.
708 : */
709 0 : WalSndSetState(WALSNDSTATE_CATCHUP);
710 :
711 : /* Send a CopyBothResponse message, and start streaming */
712 0 : pq_beginmessage(&buf, 'W');
713 0 : pq_sendbyte(&buf, 0);
714 0 : pq_sendint16(&buf, 0);
715 0 : pq_endmessage(&buf);
716 0 : pq_flush();
717 :
718 : /*
719 : * Don't allow a request to stream from a future point in WAL that
720 : * hasn't been flushed to disk in this server yet.
721 : */
722 0 : if (FlushPtr < cmd->startpoint)
723 : {
724 0 : ereport(ERROR,
725 : (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
726 : (uint32) (cmd->startpoint >> 32),
727 : (uint32) (cmd->startpoint),
728 : (uint32) (FlushPtr >> 32),
729 : (uint32) (FlushPtr))));
730 : }
731 :
732 : /* Start streaming from the requested point */
733 0 : sentPtr = cmd->startpoint;
734 :
735 : /* Initialize shared memory status, too */
736 0 : SpinLockAcquire(&MyWalSnd->mutex);
737 0 : MyWalSnd->sentPtr = sentPtr;
738 0 : SpinLockRelease(&MyWalSnd->mutex);
739 :
740 0 : SyncRepInitConfig();
741 :
742 : /* Main loop of walsender */
743 0 : replication_active = true;
744 :
745 0 : WalSndLoop(XLogSendPhysical);
746 :
747 0 : replication_active = false;
748 0 : if (got_STOPPING)
749 0 : proc_exit(0);
750 0 : WalSndSetState(WALSNDSTATE_STARTUP);
751 :
752 0 : Assert(streamingDoneSending && streamingDoneReceiving);
753 : }
754 :
755 0 : if (cmd->slotname)
756 0 : ReplicationSlotRelease();
757 :
758 : /*
759 : * Copy is finished now. Send a single-row result set indicating the next
760 : * timeline.
761 : */
762 0 : if (sendTimeLineIsHistoric)
763 : {
764 : char startpos_str[8 + 1 + 8 + 1];
765 : DestReceiver *dest;
766 : TupOutputState *tstate;
767 : TupleDesc tupdesc;
768 : Datum values[2];
769 : bool nulls[2];
770 :
771 0 : snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
772 0 : (uint32) (sendTimeLineValidUpto >> 32),
773 : (uint32) sendTimeLineValidUpto);
774 :
775 0 : dest = CreateDestReceiver(DestRemoteSimple);
776 0 : MemSet(nulls, false, sizeof(nulls));
777 :
778 : /*
779 : * Need a tuple descriptor representing two columns. int8 may seem
780 : * like a surprising data type for this, but in theory int4 would not
781 : * be wide enough for this, as TimeLineID is unsigned.
782 : */
783 0 : tupdesc = CreateTemplateTupleDesc(2);
784 0 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
785 : INT8OID, -1, 0);
786 0 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
787 : TEXTOID, -1, 0);
788 :
789 : /* prepare for projection of tuple */
790 0 : tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
791 :
792 0 : values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
793 0 : values[1] = CStringGetTextDatum(startpos_str);
794 :
795 : /* send it to dest */
796 0 : do_tup_output(tstate, values, nulls);
797 :
798 0 : end_tup_output(tstate);
799 : }
800 :
801 : /* Send CommandComplete message */
802 0 : EndReplicationCommand("START_STREAMING");
803 0 : }
804 :
805 : /*
806 : * XLogReaderRoutine->page_read callback for logical decoding contexts, as a
807 : * walsender process.
808 : *
809 : * Inside the walsender we can do better than read_local_xlog_page,
810 : * which has to do a plain sleep/busy loop, because the walsender's latch gets
811 : * set every time WAL is flushed.
812 : */
813 : static int
814 50958 : logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen,
815 : XLogRecPtr targetRecPtr, char *cur_page)
816 : {
817 : XLogRecPtr flushptr;
818 : int count;
819 : WALReadError errinfo;
820 : XLogSegNo segno;
821 :
822 50958 : XLogReadDetermineTimeline(state, targetPagePtr, reqLen);
823 50958 : sendTimeLineIsHistoric = (state->currTLI != ThisTimeLineID);
824 50958 : sendTimeLine = state->currTLI;
825 50958 : sendTimeLineValidUpto = state->currTLIValidUntil;
826 50958 : sendTimeLineNextTLI = state->nextTLI;
827 :
828 : /* make sure we have enough WAL available */
829 50958 : flushptr = WalSndWaitForWal(targetPagePtr + reqLen);
830 :
831 : /* fail if not (implies we are going to shut down) */
832 50896 : if (flushptr < targetPagePtr + reqLen)
833 18328 : return -1;
834 :
835 32568 : if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
836 31578 : count = XLOG_BLCKSZ; /* more than one block available */
837 : else
838 990 : count = flushptr - targetPagePtr; /* part of the page available */
839 :
840 : /* now actually read the data, we know it's there */
841 32568 : if (!WALRead(state,
842 : cur_page,
843 : targetPagePtr,
844 : XLOG_BLCKSZ,
845 : state->seg.ws_tli, /* Pass the current TLI because only
846 : * WalSndSegmentOpen controls whether new
847 : * TLI is needed. */
848 : &errinfo))
849 0 : WALReadRaiseError(&errinfo);
850 :
851 : /*
852 : * After reading into the buffer, check that what we read was valid. We do
853 : * this after reading, because even though the segment was present when we
854 : * opened it, it might get recycled or removed while we read it. The
855 : * read() succeeds in that case, but the data we tried to read might
856 : * already have been overwritten with new WAL records.
857 : */
858 32568 : XLByteToSeg(targetPagePtr, segno, state->segcxt.ws_segsize);
859 32568 : CheckXLogRemoved(segno, state->seg.ws_tli);
860 :
861 32568 : return count;
862 : }
863 :
864 : /*
865 : * Process extra options given to CREATE_REPLICATION_SLOT.
866 : */
867 : static void
868 182 : parseCreateReplSlotOptions(CreateReplicationSlotCmd *cmd,
869 : bool *reserve_wal,
870 : CRSSnapshotAction *snapshot_action)
871 : {
872 : ListCell *lc;
873 182 : bool snapshot_action_given = false;
874 182 : bool reserve_wal_given = false;
875 :
876 : /* Parse options */
877 364 : foreach(lc, cmd->options)
878 : {
879 182 : DefElem *defel = (DefElem *) lfirst(lc);
880 :
881 182 : if (strcmp(defel->defname, "export_snapshot") == 0)
882 : {
883 56 : if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL)
884 0 : ereport(ERROR,
885 : (errcode(ERRCODE_SYNTAX_ERROR),
886 : errmsg("conflicting or redundant options")));
887 :
888 56 : snapshot_action_given = true;
889 56 : *snapshot_action = defGetBoolean(defel) ? CRS_EXPORT_SNAPSHOT :
890 : CRS_NOEXPORT_SNAPSHOT;
891 : }
892 126 : else if (strcmp(defel->defname, "use_snapshot") == 0)
893 : {
894 126 : if (snapshot_action_given || cmd->kind != REPLICATION_KIND_LOGICAL)
895 0 : ereport(ERROR,
896 : (errcode(ERRCODE_SYNTAX_ERROR),
897 : errmsg("conflicting or redundant options")));
898 :
899 126 : snapshot_action_given = true;
900 126 : *snapshot_action = CRS_USE_SNAPSHOT;
901 : }
902 0 : else if (strcmp(defel->defname, "reserve_wal") == 0)
903 : {
904 0 : if (reserve_wal_given || cmd->kind != REPLICATION_KIND_PHYSICAL)
905 0 : ereport(ERROR,
906 : (errcode(ERRCODE_SYNTAX_ERROR),
907 : errmsg("conflicting or redundant options")));
908 :
909 0 : reserve_wal_given = true;
910 0 : *reserve_wal = true;
911 : }
912 : else
913 0 : elog(ERROR, "unrecognized option: %s", defel->defname);
914 : }
915 182 : }
916 :
917 : /*
918 : * Create a new replication slot.
919 : */
920 : static void
921 182 : CreateReplicationSlot(CreateReplicationSlotCmd *cmd)
922 : {
923 182 : const char *snapshot_name = NULL;
924 : char xloc[MAXFNAMELEN];
925 : char *slot_name;
926 182 : bool reserve_wal = false;
927 182 : CRSSnapshotAction snapshot_action = CRS_EXPORT_SNAPSHOT;
928 : DestReceiver *dest;
929 : TupOutputState *tstate;
930 : TupleDesc tupdesc;
931 : Datum values[4];
932 : bool nulls[4];
933 :
934 182 : Assert(!MyReplicationSlot);
935 :
936 182 : parseCreateReplSlotOptions(cmd, &reserve_wal, &snapshot_action);
937 :
938 : /* setup state for WalSndSegmentOpen */
939 182 : sendTimeLineIsHistoric = false;
940 182 : sendTimeLine = ThisTimeLineID;
941 :
942 182 : if (cmd->kind == REPLICATION_KIND_PHYSICAL)
943 : {
944 0 : ReplicationSlotCreate(cmd->slotname, false,
945 0 : cmd->temporary ? RS_TEMPORARY : RS_PERSISTENT);
946 : }
947 : else
948 : {
949 182 : CheckLogicalDecodingRequirements();
950 :
951 : /*
952 : * Initially create persistent slot as ephemeral - that allows us to
953 : * nicely handle errors during initialization because it'll get
954 : * dropped if this transaction fails. We'll make it persistent at the
955 : * end. Temporary slots can be created as temporary from beginning as
956 : * they get dropped on error as well.
957 : */
958 182 : ReplicationSlotCreate(cmd->slotname, true,
959 182 : cmd->temporary ? RS_TEMPORARY : RS_EPHEMERAL);
960 : }
961 :
962 182 : if (cmd->kind == REPLICATION_KIND_LOGICAL)
963 : {
964 : LogicalDecodingContext *ctx;
965 182 : bool need_full_snapshot = false;
966 :
967 : /*
968 : * Do options check early so that we can bail before calling the
969 : * DecodingContextFindStartpoint which can take long time.
970 : */
971 182 : if (snapshot_action == CRS_EXPORT_SNAPSHOT)
972 : {
973 0 : if (IsTransactionBlock())
974 0 : ereport(ERROR,
975 : /*- translator: %s is a CREATE_REPLICATION_SLOT statement */
976 : (errmsg("%s must not be called inside a transaction",
977 : "CREATE_REPLICATION_SLOT ... EXPORT_SNAPSHOT")));
978 :
979 0 : need_full_snapshot = true;
980 : }
981 182 : else if (snapshot_action == CRS_USE_SNAPSHOT)
982 : {
983 126 : if (!IsTransactionBlock())
984 0 : ereport(ERROR,
985 : /*- translator: %s is a CREATE_REPLICATION_SLOT statement */
986 : (errmsg("%s must be called inside a transaction",
987 : "CREATE_REPLICATION_SLOT ... USE_SNAPSHOT")));
988 :
989 126 : if (XactIsoLevel != XACT_REPEATABLE_READ)
990 0 : ereport(ERROR,
991 : /*- translator: %s is a CREATE_REPLICATION_SLOT statement */
992 : (errmsg("%s must be called in REPEATABLE READ isolation mode transaction",
993 : "CREATE_REPLICATION_SLOT ... USE_SNAPSHOT")));
994 :
995 126 : if (FirstSnapshotSet)
996 0 : ereport(ERROR,
997 : /*- translator: %s is a CREATE_REPLICATION_SLOT statement */
998 : (errmsg("%s must be called before any query",
999 : "CREATE_REPLICATION_SLOT ... USE_SNAPSHOT")));
1000 :
1001 126 : if (IsSubTransaction())
1002 0 : ereport(ERROR,
1003 : /*- translator: %s is a CREATE_REPLICATION_SLOT statement */
1004 : (errmsg("%s must not be called in a subtransaction",
1005 : "CREATE_REPLICATION_SLOT ... USE_SNAPSHOT")));
1006 :
1007 126 : need_full_snapshot = true;
1008 : }
1009 :
1010 182 : ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot,
1011 : InvalidXLogRecPtr,
1012 182 : XL_ROUTINE(.page_read = logical_read_xlog_page,
1013 : .segment_open = WalSndSegmentOpen,
1014 : .segment_close = wal_segment_close),
1015 : WalSndPrepareWrite, WalSndWriteData,
1016 : WalSndUpdateProgress);
1017 :
1018 : /*
1019 : * Signal that we don't need the timeout mechanism. We're just
1020 : * creating the replication slot and don't yet accept feedback
1021 : * messages or send keepalives. As we possibly need to wait for
1022 : * further WAL the walsender would otherwise possibly be killed too
1023 : * soon.
1024 : */
1025 182 : last_reply_timestamp = 0;
1026 :
1027 : /* build initial snapshot, might take a while */
1028 182 : DecodingContextFindStartpoint(ctx);
1029 :
1030 : /*
1031 : * Export or use the snapshot if we've been asked to do so.
1032 : *
1033 : * NB. We will convert the snapbuild.c kind of snapshot to normal
1034 : * snapshot when doing this.
1035 : */
1036 182 : if (snapshot_action == CRS_EXPORT_SNAPSHOT)
1037 : {
1038 0 : snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder);
1039 : }
1040 182 : else if (snapshot_action == CRS_USE_SNAPSHOT)
1041 : {
1042 : Snapshot snap;
1043 :
1044 126 : snap = SnapBuildInitialSnapshot(ctx->snapshot_builder);
1045 126 : RestoreTransactionSnapshot(snap, MyProc);
1046 : }
1047 :
1048 : /* don't need the decoding context anymore */
1049 182 : FreeDecodingContext(ctx);
1050 :
1051 182 : if (!cmd->temporary)
1052 56 : ReplicationSlotPersist();
1053 : }
1054 0 : else if (cmd->kind == REPLICATION_KIND_PHYSICAL && reserve_wal)
1055 : {
1056 0 : ReplicationSlotReserveWal();
1057 :
1058 0 : ReplicationSlotMarkDirty();
1059 :
1060 : /* Write this slot to disk if it's a permanent one. */
1061 0 : if (!cmd->temporary)
1062 0 : ReplicationSlotSave();
1063 : }
1064 :
1065 364 : snprintf(xloc, sizeof(xloc), "%X/%X",
1066 182 : (uint32) (MyReplicationSlot->data.confirmed_flush >> 32),
1067 182 : (uint32) MyReplicationSlot->data.confirmed_flush);
1068 :
1069 182 : dest = CreateDestReceiver(DestRemoteSimple);
1070 182 : MemSet(nulls, false, sizeof(nulls));
1071 :
1072 : /*----------
1073 : * Need a tuple descriptor representing four columns:
1074 : * - first field: the slot name
1075 : * - second field: LSN at which we became consistent
1076 : * - third field: exported snapshot's name
1077 : * - fourth field: output plugin
1078 : *----------
1079 : */
1080 182 : tupdesc = CreateTemplateTupleDesc(4);
1081 182 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "slot_name",
1082 : TEXTOID, -1, 0);
1083 182 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "consistent_point",
1084 : TEXTOID, -1, 0);
1085 182 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 3, "snapshot_name",
1086 : TEXTOID, -1, 0);
1087 182 : TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 4, "output_plugin",
1088 : TEXTOID, -1, 0);
1089 :
1090 : /* prepare for projection of tuples */
1091 182 : tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
1092 :
1093 : /* slot_name */
1094 182 : slot_name = NameStr(MyReplicationSlot->data.name);
1095 182 : values[0] = CStringGetTextDatum(slot_name);
1096 :
1097 : /* consistent wal location */
1098 182 : values[1] = CStringGetTextDatum(xloc);
1099 :
1100 : /* snapshot name, or NULL if none */
1101 182 : if (snapshot_name != NULL)
1102 0 : values[2] = CStringGetTextDatum(snapshot_name);
1103 : else
1104 182 : nulls[2] = true;
1105 :
1106 : /* plugin, or NULL if none */
1107 182 : if (cmd->plugin != NULL)
1108 182 : values[3] = CStringGetTextDatum(cmd->plugin);
1109 : else
1110 0 : nulls[3] = true;
1111 :
1112 : /* send it to dest */
1113 182 : do_tup_output(tstate, values, nulls);
1114 182 : end_tup_output(tstate);
1115 :
1116 182 : ReplicationSlotRelease();
1117 182 : }
1118 :
1119 : /*
1120 : * Get rid of a replication slot that is no longer wanted.
1121 : */
1122 : static void
1123 18 : DropReplicationSlot(DropReplicationSlotCmd *cmd)
1124 : {
1125 18 : ReplicationSlotDrop(cmd->slotname, !cmd->wait);
1126 18 : }
1127 :
1128 : /*
1129 : * Load previously initiated logical slot and prepare for sending data (via
1130 : * WalSndLoop).
1131 : */
1132 : static void
1133 196 : StartLogicalReplication(StartReplicationCmd *cmd)
1134 : {
1135 : StringInfoData buf;
1136 : QueryCompletion qc;
1137 :
1138 : /* make sure that our requirements are still fulfilled */
1139 196 : CheckLogicalDecodingRequirements();
1140 :
1141 196 : Assert(!MyReplicationSlot);
1142 :
1143 196 : (void) ReplicationSlotAcquire(cmd->slotname, SAB_Error);
1144 :
1145 196 : if (XLogRecPtrIsInvalid(MyReplicationSlot->data.restart_lsn))
1146 0 : ereport(ERROR,
1147 : (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1148 : errmsg("cannot read from logical replication slot \"%s\"",
1149 : cmd->slotname),
1150 : errdetail("This slot has been invalidated because it exceeded the maximum reserved size.")));
1151 :
1152 : /*
1153 : * Force a disconnect, so that the decoding code doesn't need to care
1154 : * about an eventual switch from running in recovery, to running in a
1155 : * normal environment. Client code is expected to handle reconnects.
1156 : */
1157 196 : if (am_cascading_walsender && !RecoveryInProgress())
1158 : {
1159 0 : ereport(LOG,
1160 : (errmsg("terminating walsender process after promotion")));
1161 0 : got_STOPPING = true;
1162 : }
1163 :
1164 : /*
1165 : * Create our decoding context, making it start at the previously ack'ed
1166 : * position.
1167 : *
1168 : * Do this before sending a CopyBothResponse message, so that any errors
1169 : * are reported early.
1170 : */
1171 196 : logical_decoding_ctx =
1172 196 : CreateDecodingContext(cmd->startpoint, cmd->options, false,
1173 196 : XL_ROUTINE(.page_read = logical_read_xlog_page,
1174 : .segment_open = WalSndSegmentOpen,
1175 : .segment_close = wal_segment_close),
1176 : WalSndPrepareWrite, WalSndWriteData,
1177 : WalSndUpdateProgress);
1178 196 : xlogreader = logical_decoding_ctx->reader;
1179 :
1180 196 : WalSndSetState(WALSNDSTATE_CATCHUP);
1181 :
1182 : /* Send a CopyBothResponse message, and start streaming */
1183 196 : pq_beginmessage(&buf, 'W');
1184 196 : pq_sendbyte(&buf, 0);
1185 196 : pq_sendint16(&buf, 0);
1186 196 : pq_endmessage(&buf);
1187 196 : pq_flush();
1188 :
1189 : /* Start reading WAL from the oldest required WAL. */
1190 196 : XLogBeginRead(logical_decoding_ctx->reader,
1191 196 : MyReplicationSlot->data.restart_lsn);
1192 :
1193 : /*
1194 : * Report the location after which we'll send out further commits as the
1195 : * current sentPtr.
1196 : */
1197 196 : sentPtr = MyReplicationSlot->data.confirmed_flush;
1198 :
1199 : /* Also update the sent position status in shared memory */
1200 196 : SpinLockAcquire(&MyWalSnd->mutex);
1201 196 : MyWalSnd->sentPtr = MyReplicationSlot->data.restart_lsn;
1202 196 : SpinLockRelease(&MyWalSnd->mutex);
1203 :
1204 196 : replication_active = true;
1205 :
1206 196 : SyncRepInitConfig();
1207 :
1208 : /* Main loop of walsender */
1209 196 : WalSndLoop(XLogSendLogical);
1210 :
1211 122 : FreeDecodingContext(logical_decoding_ctx);
1212 122 : ReplicationSlotRelease();
1213 :
1214 122 : replication_active = false;
1215 122 : if (got_STOPPING)
1216 0 : proc_exit(0);
1217 122 : WalSndSetState(WALSNDSTATE_STARTUP);
1218 :
1219 : /* Get out of COPY mode (CommandComplete). */
1220 122 : SetQueryCompletion(&qc, CMDTAG_COPY, 0);
1221 122 : EndCommand(&qc, DestRemote, false);
1222 122 : }
1223 :
1224 : /*
1225 : * LogicalDecodingContext 'prepare_write' callback.
1226 : *
1227 : * Prepare a write into a StringInfo.
1228 : *
1229 : * Don't do anything lasting in here, it's quite possible that nothing will be done
1230 : * with the data.
1231 : */
1232 : static void
1233 322410 : WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write)
1234 : {
1235 : /* can't have sync rep confused by sending the same LSN several times */
1236 322410 : if (!last_write)
1237 226 : lsn = InvalidXLogRecPtr;
1238 :
1239 322410 : resetStringInfo(ctx->out);
1240 :
1241 322410 : pq_sendbyte(ctx->out, 'w');
1242 322410 : pq_sendint64(ctx->out, lsn); /* dataStart */
1243 322410 : pq_sendint64(ctx->out, lsn); /* walEnd */
1244 :
1245 : /*
1246 : * Fill out the sendtime later, just as it's done in XLogSendPhysical, but
1247 : * reserve space here.
1248 : */
1249 322410 : pq_sendint64(ctx->out, 0); /* sendtime */
1250 322410 : }
1251 :
1252 : /*
1253 : * LogicalDecodingContext 'write' callback.
1254 : *
1255 : * Actually write out data previously prepared by WalSndPrepareWrite out to
1256 : * the network. Take as long as needed, but process replies from the other
1257 : * side and check timeouts during that.
1258 : */
1259 : static void
1260 322410 : WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
1261 : bool last_write)
1262 : {
1263 : TimestampTz now;
1264 :
1265 : /*
1266 : * Fill the send timestamp last, so that it is taken as late as possible.
1267 : * This is somewhat ugly, but the protocol is set as it's already used for
1268 : * several releases by streaming physical replication.
1269 : */
1270 322410 : resetStringInfo(&tmpbuf);
1271 322410 : now = GetCurrentTimestamp();
1272 322410 : pq_sendint64(&tmpbuf, now);
1273 322410 : memcpy(&ctx->out->data[1 + sizeof(int64) + sizeof(int64)],
1274 322410 : tmpbuf.data, sizeof(int64));
1275 :
1276 : /* output previously gathered data in a CopyData packet */
1277 322410 : pq_putmessage_noblock('d', ctx->out->data, ctx->out->len);
1278 :
1279 322410 : CHECK_FOR_INTERRUPTS();
1280 :
1281 : /* Try to flush pending output to the client */
1282 322410 : if (pq_flush_if_writable() != 0)
1283 6 : WalSndShutdown();
1284 :
1285 : /* Try taking fast path unless we get too close to walsender timeout. */
1286 322404 : if (now < TimestampTzPlusMilliseconds(last_reply_timestamp,
1287 322404 : wal_sender_timeout / 2) &&
1288 322404 : !pq_is_send_pending())
1289 : {
1290 644706 : return;
1291 : }
1292 :
1293 : /* If we have pending write here, go to slow path */
1294 : for (;;)
1295 : {
1296 : int wakeEvents;
1297 : long sleeptime;
1298 :
1299 : /* Check for input from the client */
1300 230 : ProcessRepliesIfAny();
1301 :
1302 : /* die if timeout was reached */
1303 230 : WalSndCheckTimeOut();
1304 :
1305 : /* Send keepalive if the time has come */
1306 230 : WalSndKeepaliveIfNecessary();
1307 :
1308 230 : if (!pq_is_send_pending())
1309 102 : break;
1310 :
1311 128 : sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
1312 :
1313 128 : wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
1314 : WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
1315 :
1316 : /* Sleep until something happens or we time out */
1317 128 : (void) WaitLatchOrSocket(MyLatch, wakeEvents,
1318 128 : MyProcPort->sock, sleeptime,
1319 : WAIT_EVENT_WAL_SENDER_WRITE_DATA);
1320 :
1321 : /* Clear any already-pending wakeups */
1322 128 : ResetLatch(MyLatch);
1323 :
1324 128 : CHECK_FOR_INTERRUPTS();
1325 :
1326 : /* Process any requests or signals received recently */
1327 128 : if (ConfigReloadPending)
1328 : {
1329 0 : ConfigReloadPending = false;
1330 0 : ProcessConfigFile(PGC_SIGHUP);
1331 0 : SyncRepInitConfig();
1332 : }
1333 :
1334 : /* Try to flush pending output to the client */
1335 128 : if (pq_flush_if_writable() != 0)
1336 0 : WalSndShutdown();
1337 128 : }
1338 :
1339 : /* reactivate latch so WalSndLoop knows to continue */
1340 102 : SetLatch(MyLatch);
1341 : }
1342 :
1343 : /*
1344 : * LogicalDecodingContext 'update_progress' callback.
1345 : *
1346 : * Write the current position to the lag tracker (see XLogSendPhysical).
1347 : */
1348 : static void
1349 434 : WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid)
1350 : {
1351 : static TimestampTz sendTime = 0;
1352 434 : TimestampTz now = GetCurrentTimestamp();
1353 :
1354 : /*
1355 : * Track lag no more than once per WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS to
1356 : * avoid flooding the lag tracker when we commit frequently.
1357 : */
1358 : #define WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS 1000
1359 434 : if (!TimestampDifferenceExceeds(sendTime, now,
1360 : WALSND_LOGICAL_LAG_TRACK_INTERVAL_MS))
1361 792 : return;
1362 :
1363 76 : LagTrackerWrite(lsn, now);
1364 76 : sendTime = now;
1365 : }
1366 :
1367 : /*
1368 : * Wait till WAL < loc is flushed to disk so it can be safely sent to client.
1369 : *
1370 : * Returns end LSN of flushed WAL. Normally this will be >= loc, but
1371 : * if we detect a shutdown request (either from postmaster or client)
1372 : * we will return early, so caller must always check.
1373 : */
1374 : static XLogRecPtr
1375 50958 : WalSndWaitForWal(XLogRecPtr loc)
1376 : {
1377 : int wakeEvents;
1378 : static XLogRecPtr RecentFlushPtr = InvalidXLogRecPtr;
1379 :
1380 : /*
1381 : * Fast path to avoid acquiring the spinlock in case we already know we
1382 : * have enough WAL available. This is particularly interesting if we're
1383 : * far behind.
1384 : */
1385 101660 : if (RecentFlushPtr != InvalidXLogRecPtr &&
1386 50702 : loc <= RecentFlushPtr)
1387 31670 : return RecentFlushPtr;
1388 :
1389 : /* Get a more recent flush pointer. */
1390 19288 : if (!RecoveryInProgress())
1391 19288 : RecentFlushPtr = GetFlushRecPtr();
1392 : else
1393 0 : RecentFlushPtr = GetXLogReplayRecPtr(NULL);
1394 :
1395 : for (;;)
1396 : {
1397 : long sleeptime;
1398 :
1399 : /* Clear any already-pending wakeups */
1400 20606 : ResetLatch(MyLatch);
1401 :
1402 20606 : CHECK_FOR_INTERRUPTS();
1403 :
1404 : /* Process any requests or signals received recently */
1405 20606 : if (ConfigReloadPending)
1406 : {
1407 0 : ConfigReloadPending = false;
1408 0 : ProcessConfigFile(PGC_SIGHUP);
1409 0 : SyncRepInitConfig();
1410 : }
1411 :
1412 : /* Check for input from the client */
1413 20606 : ProcessRepliesIfAny();
1414 :
1415 : /*
1416 : * If we're shutting down, trigger pending WAL to be written out,
1417 : * otherwise we'd possibly end up waiting for WAL that never gets
1418 : * written, because walwriter has shut down already.
1419 : */
1420 20544 : if (got_STOPPING)
1421 18208 : XLogBackgroundFlush();
1422 :
1423 : /* Update our idea of the currently flushed position. */
1424 20544 : if (!RecoveryInProgress())
1425 20544 : RecentFlushPtr = GetFlushRecPtr();
1426 : else
1427 0 : RecentFlushPtr = GetXLogReplayRecPtr(NULL);
1428 :
1429 : /*
1430 : * If postmaster asked us to stop, don't wait anymore.
1431 : *
1432 : * It's important to do this check after the recomputation of
1433 : * RecentFlushPtr, so we can send all remaining data before shutting
1434 : * down.
1435 : */
1436 20544 : if (got_STOPPING)
1437 18208 : break;
1438 :
1439 : /*
1440 : * We only send regular messages to the client for full decoded
1441 : * transactions, but a synchronous replication and walsender shutdown
1442 : * possibly are waiting for a later location. So, before sleeping, we
1443 : * send a ping containing the flush location. If the receiver is
1444 : * otherwise idle, this keepalive will trigger a reply. Processing the
1445 : * reply will update these MyWalSnd locations.
1446 : */
1447 3722 : if (MyWalSnd->flush < sentPtr &&
1448 2314 : MyWalSnd->write < sentPtr &&
1449 928 : !waiting_for_ping_response)
1450 928 : WalSndKeepalive(false);
1451 :
1452 : /* check whether we're done */
1453 2336 : if (loc <= RecentFlushPtr)
1454 898 : break;
1455 :
1456 : /* Waiting for new WAL. Since we need to wait, we're now caught up. */
1457 1438 : WalSndCaughtUp = true;
1458 :
1459 : /*
1460 : * Try to flush any pending output to the client.
1461 : */
1462 1438 : if (pq_flush_if_writable() != 0)
1463 0 : WalSndShutdown();
1464 :
1465 : /*
1466 : * If we have received CopyDone from the client, sent CopyDone
1467 : * ourselves, and the output buffer is empty, it's time to exit
1468 : * streaming, so fail the current WAL fetch request.
1469 : */
1470 1558 : if (streamingDoneReceiving && streamingDoneSending &&
1471 120 : !pq_is_send_pending())
1472 120 : break;
1473 :
1474 : /* die if timeout was reached */
1475 1318 : WalSndCheckTimeOut();
1476 :
1477 : /* Send keepalive if the time has come */
1478 1318 : WalSndKeepaliveIfNecessary();
1479 :
1480 : /*
1481 : * Sleep until something happens or we time out. Also wait for the
1482 : * socket becoming writable, if there's still pending output.
1483 : * Otherwise we might sit on sendable output data while waiting for
1484 : * new WAL to be generated. (But if we have nothing to send, we don't
1485 : * want to wake on socket-writable.)
1486 : */
1487 1318 : sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
1488 :
1489 1318 : wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
1490 : WL_SOCKET_READABLE | WL_TIMEOUT;
1491 :
1492 1318 : if (pq_is_send_pending())
1493 0 : wakeEvents |= WL_SOCKET_WRITEABLE;
1494 :
1495 1318 : (void) WaitLatchOrSocket(MyLatch, wakeEvents,
1496 1318 : MyProcPort->sock, sleeptime,
1497 : WAIT_EVENT_WAL_SENDER_WAIT_WAL);
1498 1318 : }
1499 :
1500 : /* reactivate latch so WalSndLoop knows to continue */
1501 19226 : SetLatch(MyLatch);
1502 19226 : return RecentFlushPtr;
1503 : }
1504 :
1505 : /*
1506 : * Execute an incoming replication command.
1507 : *
1508 : * Returns true if the cmd_string was recognized as WalSender command, false
1509 : * if not.
1510 : */
1511 : bool
1512 1454 : exec_replication_command(const char *cmd_string)
1513 : {
1514 : int parse_rc;
1515 : Node *cmd_node;
1516 : const char *cmdtag;
1517 : MemoryContext cmd_context;
1518 : MemoryContext old_context;
1519 :
1520 : /*
1521 : * If WAL sender has been told that shutdown is getting close, switch its
1522 : * status accordingly to handle the next replication commands correctly.
1523 : */
1524 1454 : if (got_STOPPING)
1525 0 : WalSndSetState(WALSNDSTATE_STOPPING);
1526 :
1527 : /*
1528 : * Throw error if in stopping mode. We need prevent commands that could
1529 : * generate WAL while the shutdown checkpoint is being written. To be
1530 : * safe, we just prohibit all new commands.
1531 : */
1532 1454 : if (MyWalSnd->state == WALSNDSTATE_STOPPING)
1533 0 : ereport(ERROR,
1534 : (errmsg("cannot execute new commands while WAL sender is in stopping mode")));
1535 :
1536 : /*
1537 : * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot until the next
1538 : * command arrives. Clean up the old stuff if there's anything.
1539 : */
1540 1454 : SnapBuildClearExportedSnapshot();
1541 :
1542 1454 : CHECK_FOR_INTERRUPTS();
1543 :
1544 : /*
1545 : * Parse the command.
1546 : */
1547 1454 : cmd_context = AllocSetContextCreate(CurrentMemoryContext,
1548 : "Replication command context",
1549 : ALLOCSET_DEFAULT_SIZES);
1550 1454 : old_context = MemoryContextSwitchTo(cmd_context);
1551 :
1552 1454 : replication_scanner_init(cmd_string);
1553 1454 : parse_rc = replication_yyparse();
1554 1454 : if (parse_rc != 0)
1555 0 : ereport(ERROR,
1556 : (errcode(ERRCODE_SYNTAX_ERROR),
1557 : errmsg_internal("replication command parser returned %d",
1558 : parse_rc)));
1559 1454 : replication_scanner_finish();
1560 :
1561 1454 : cmd_node = replication_parse_result;
1562 :
1563 : /*
1564 : * If it's a SQL command, just clean up our mess and return false; the
1565 : * caller will take care of executing it.
1566 : */
1567 1454 : if (IsA(cmd_node, SQLCmd))
1568 : {
1569 984 : if (MyDatabaseId == InvalidOid)
1570 0 : ereport(ERROR,
1571 : (errmsg("cannot execute SQL commands in WAL sender for physical replication")));
1572 :
1573 984 : MemoryContextSwitchTo(old_context);
1574 984 : MemoryContextDelete(cmd_context);
1575 :
1576 : /* Tell the caller that this wasn't a WalSender command. */
1577 984 : return false;
1578 : }
1579 :
1580 : /*
1581 : * Report query to various monitoring facilities. For this purpose, we
1582 : * report replication commands just like SQL commands.
1583 : */
1584 470 : debug_query_string = cmd_string;
1585 :
1586 470 : pgstat_report_activity(STATE_RUNNING, cmd_string);
1587 :
1588 : /*
1589 : * Log replication command if log_replication_commands is enabled. Even
1590 : * when it's disabled, log the command with DEBUG1 level for backward
1591 : * compatibility.
1592 : */
1593 470 : ereport(log_replication_commands ? LOG : DEBUG1,
1594 : (errmsg("received replication command: %s", cmd_string)));
1595 :
1596 : /*
1597 : * Disallow replication commands in aborted transaction blocks.
1598 : */
1599 470 : if (IsAbortedTransactionBlockState())
1600 0 : ereport(ERROR,
1601 : (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION),
1602 : errmsg("current transaction is aborted, "
1603 : "commands ignored until end of transaction block")));
1604 :
1605 470 : CHECK_FOR_INTERRUPTS();
1606 :
1607 : /*
1608 : * Allocate buffers that will be used for each outgoing and incoming
1609 : * message. We do this just once per command to reduce palloc overhead.
1610 : */
1611 470 : initStringInfo(&output_message);
1612 470 : initStringInfo(&reply_message);
1613 470 : initStringInfo(&tmpbuf);
1614 :
1615 470 : switch (cmd_node->type)
1616 : {
1617 : case T_IdentifySystemCmd:
1618 74 : cmdtag = "IDENTIFY_SYSTEM";
1619 74 : set_ps_display(cmdtag);
1620 74 : IdentifySystem();
1621 74 : EndReplicationCommand(cmdtag);
1622 74 : break;
1623 :
1624 : case T_BaseBackupCmd:
1625 0 : cmdtag = "BASE_BACKUP";
1626 0 : set_ps_display(cmdtag);
1627 0 : PreventInTransactionBlock(true, cmdtag);
1628 0 : SendBaseBackup((BaseBackupCmd *) cmd_node);
1629 0 : EndReplicationCommand(cmdtag);
1630 0 : break;
1631 :
1632 : case T_CreateReplicationSlotCmd:
1633 182 : cmdtag = "CREATE_REPLICATION_SLOT";
1634 182 : set_ps_display(cmdtag);
1635 182 : CreateReplicationSlot((CreateReplicationSlotCmd *) cmd_node);
1636 182 : EndReplicationCommand(cmdtag);
1637 182 : break;
1638 :
1639 : case T_DropReplicationSlotCmd:
1640 18 : cmdtag = "DROP_REPLICATION_SLOT";
1641 18 : set_ps_display(cmdtag);
1642 18 : DropReplicationSlot((DropReplicationSlotCmd *) cmd_node);
1643 18 : EndReplicationCommand(cmdtag);
1644 18 : break;
1645 :
1646 : case T_StartReplicationCmd:
1647 : {
1648 196 : StartReplicationCmd *cmd = (StartReplicationCmd *) cmd_node;
1649 :
1650 196 : cmdtag = "START_REPLICATION";
1651 196 : set_ps_display(cmdtag);
1652 196 : PreventInTransactionBlock(true, cmdtag);
1653 :
1654 196 : if (cmd->kind == REPLICATION_KIND_PHYSICAL)
1655 0 : StartReplication(cmd);
1656 : else
1657 196 : StartLogicalReplication(cmd);
1658 :
1659 : /* dupe, but necessary per libpqrcv_endstreaming */
1660 122 : EndReplicationCommand(cmdtag);
1661 :
1662 122 : Assert(xlogreader != NULL);
1663 122 : break;
1664 : }
1665 :
1666 : case T_TimeLineHistoryCmd:
1667 0 : cmdtag = "TIMELINE_HISTORY";
1668 0 : set_ps_display(cmdtag);
1669 0 : PreventInTransactionBlock(true, cmdtag);
1670 0 : SendTimeLineHistory((TimeLineHistoryCmd *) cmd_node);
1671 0 : EndReplicationCommand(cmdtag);
1672 0 : break;
1673 :
1674 : case T_VariableShowStmt:
1675 : {
1676 0 : DestReceiver *dest = CreateDestReceiver(DestRemoteSimple);
1677 0 : VariableShowStmt *n = (VariableShowStmt *) cmd_node;
1678 :
1679 0 : cmdtag = "SHOW";
1680 0 : set_ps_display(cmdtag);
1681 :
1682 : /* syscache access needs a transaction environment */
1683 0 : StartTransactionCommand();
1684 0 : GetPGVariable(n->name, dest);
1685 0 : CommitTransactionCommand();
1686 0 : EndReplicationCommand(cmdtag);
1687 : }
1688 0 : break;
1689 :
1690 : default:
1691 0 : elog(ERROR, "unrecognized replication command node tag: %u",
1692 : cmd_node->type);
1693 : }
1694 :
1695 : /* done */
1696 396 : MemoryContextSwitchTo(old_context);
1697 396 : MemoryContextDelete(cmd_context);
1698 :
1699 : /*
1700 : * We need not update ps display or pg_stat_activity, because PostgresMain
1701 : * will reset those to "idle". But we must reset debug_query_string to
1702 : * ensure it doesn't become a dangling pointer.
1703 : */
1704 396 : debug_query_string = NULL;
1705 :
1706 396 : return true;
1707 : }
1708 :
1709 : /*
1710 : * Process any incoming messages while streaming. Also checks if the remote
1711 : * end has closed the connection.
1712 : */
1713 : static void
1714 1203282 : ProcessRepliesIfAny(void)
1715 : {
1716 : unsigned char firstchar;
1717 : int r;
1718 1203282 : bool received = false;
1719 :
1720 1203282 : last_processing = GetCurrentTimestamp();
1721 :
1722 : for (;;)
1723 : {
1724 1253538 : pq_startmsgread();
1725 1253538 : r = pq_getbyte_if_available(&firstchar);
1726 1253538 : if (r < 0)
1727 : {
1728 : /* unexpected error or EOF */
1729 6 : ereport(COMMERROR,
1730 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
1731 : errmsg("unexpected EOF on standby connection")));
1732 6 : proc_exit(0);
1733 : }
1734 1253532 : if (r == 0)
1735 : {
1736 : /* no data available without blocking */
1737 1203218 : pq_endmsgread();
1738 1203218 : break;
1739 : }
1740 :
1741 : /* Read the message contents */
1742 50314 : resetStringInfo(&reply_message);
1743 50314 : if (pq_getmessage(&reply_message, 0))
1744 : {
1745 0 : ereport(COMMERROR,
1746 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
1747 : errmsg("unexpected EOF on standby connection")));
1748 0 : proc_exit(0);
1749 : }
1750 :
1751 : /*
1752 : * If we already received a CopyDone from the frontend, the frontend
1753 : * should not send us anything until we've closed our end of the COPY.
1754 : * XXX: In theory, the frontend could already send the next command
1755 : * before receiving the CopyDone, but libpq doesn't currently allow
1756 : * that.
1757 : */
1758 50314 : if (streamingDoneReceiving && firstchar != 'X')
1759 0 : ereport(FATAL,
1760 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
1761 : errmsg("unexpected standby message type \"%c\", after receiving CopyDone",
1762 : firstchar)));
1763 :
1764 : /* Handle the very limited subset of commands expected in this phase */
1765 50314 : switch (firstchar)
1766 : {
1767 : /*
1768 : * 'd' means a standby reply wrapped in a CopyData packet.
1769 : */
1770 : case 'd':
1771 50134 : ProcessStandbyMessage();
1772 50134 : received = true;
1773 50134 : break;
1774 :
1775 : /*
1776 : * CopyDone means the standby requested to finish streaming.
1777 : * Reply with CopyDone, if we had not sent that already.
1778 : */
1779 : case 'c':
1780 122 : if (!streamingDoneSending)
1781 : {
1782 122 : pq_putmessage_noblock('c', NULL, 0);
1783 122 : streamingDoneSending = true;
1784 : }
1785 :
1786 122 : streamingDoneReceiving = true;
1787 122 : received = true;
1788 122 : break;
1789 :
1790 : /*
1791 : * 'X' means that the standby is closing down the socket.
1792 : */
1793 : case 'X':
1794 58 : proc_exit(0);
1795 :
1796 : default:
1797 0 : ereport(FATAL,
1798 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
1799 : errmsg("invalid standby message type \"%c\"",
1800 : firstchar)));
1801 : }
1802 50256 : }
1803 :
1804 : /*
1805 : * Save the last reply timestamp if we've received at least one reply.
1806 : */
1807 1203218 : if (received)
1808 : {
1809 4712 : last_reply_timestamp = last_processing;
1810 4712 : waiting_for_ping_response = false;
1811 : }
1812 1203218 : }
1813 :
1814 : /*
1815 : * Process a status update message received from standby.
1816 : * Dispatches on the type byte read from the front of reply_message. */
1817 : static void
1818 50134 : ProcessStandbyMessage(void)
1819 : {
1820 : char msgtype;
1821 :
1822 : /*
1823 : * Check message type from the first byte.
1824 : */
1825 50134 : msgtype = pq_getmsgbyte(&reply_message);
1826 :
1827 50134 : switch (msgtype)
1828 : {
1829 : case 'r': /* regular standby reply (WAL positions) */
1830 50134 : ProcessStandbyReplyMessage();
1831 50134 : break;
1832 :
1833 : case 'h': /* hot standby feedback (xmin information) */
1834 0 : ProcessStandbyHSFeedbackMessage();
1835 0 : break;
1836 :
1837 : default: /* any other type: report at COMMERROR, then exit the process */
1838 0 : ereport(COMMERROR,
1839 : (errcode(ERRCODE_PROTOCOL_VIOLATION),
1840 : errmsg("unexpected message type \"%c\"", msgtype)));
1841 0 : proc_exit(0);
1842 : }
1843 50134 : }
1844 :
1845 : /*
1846 : * Remember that a walreceiver just confirmed receipt of lsn `lsn`.
1847 : * The value is stored as restart_lsn of MyReplicationSlot; lsn must be valid. */
1848 : static void
1849 0 : PhysicalConfirmReceivedLocation(XLogRecPtr lsn)
1850 : {
1851 0 : bool changed = false;
1852 0 : ReplicationSlot *slot = MyReplicationSlot;
1853 :
1854 0 : Assert(lsn != InvalidXLogRecPtr);
1855 0 : SpinLockAcquire(&slot->mutex); /* restart_lsn is compared and updated under the slot mutex */
1856 0 : if (slot->data.restart_lsn != lsn)
1857 : {
1858 0 : changed = true;
1859 0 : slot->data.restart_lsn = lsn;
1860 : }
1861 0 : SpinLockRelease(&slot->mutex);
1862 :
1863 0 : if (changed) /* skip the follow-up work when the value did not move */
1864 : {
1865 0 : ReplicationSlotMarkDirty();
1866 0 : ReplicationSlotsComputeRequiredLSN();
1867 : }
1868 :
1869 : /*
1870 : * One could argue that the slot should be saved to disk now, but that'd
1871 : * be energy wasted - the worst lost information can do here is give us
1872 : * wrong information in a statistics view - we'll just potentially be more
1873 : * conservative in removing files.
1874 : */
1875 0 : }
1876 :
1877 : /*
1878 : * Regular reply from standby advising of WAL locations on standby server.
1879 : * Publishes the reported positions in MyWalSnd and forwards flushPtr to the slot, if any. */
1880 : static void
1881 50134 : ProcessStandbyReplyMessage(void)
1882 : {
1883 : XLogRecPtr writePtr,
1884 : flushPtr,
1885 : applyPtr;
1886 : bool replyRequested;
1887 : TimeOffset writeLag,
1888 : flushLag,
1889 : applyLag;
1890 : bool clearLagTimes;
1891 : TimestampTz now;
1892 : TimestampTz replyTime;
1893 :
1894 : static bool fullyAppliedLastTime = false; /* carried across calls; drives clearLagTimes below */
1895 :
1896 : /* the caller already consumed the msgtype byte; fields are read in wire order */
1897 50134 : writePtr = pq_getmsgint64(&reply_message);
1898 50134 : flushPtr = pq_getmsgint64(&reply_message);
1899 50134 : applyPtr = pq_getmsgint64(&reply_message);
1900 50134 : replyTime = pq_getmsgint64(&reply_message);
1901 50134 : replyRequested = pq_getmsgbyte(&reply_message);
1902 :
1903 50134 : if (log_min_messages <= DEBUG2)
1904 : {
1905 : char *replyTimeStr;
1906 :
1907 : /* Copy because timestamptz_to_str returns a static buffer */
1908 0 : replyTimeStr = pstrdup(timestamptz_to_str(replyTime));
1909 :
1910 0 : elog(DEBUG2, "write %X/%X flush %X/%X apply %X/%X%s reply_time %s",
1911 : (uint32) (writePtr >> 32), (uint32) writePtr,
1912 : (uint32) (flushPtr >> 32), (uint32) flushPtr,
1913 : (uint32) (applyPtr >> 32), (uint32) applyPtr,
1914 : replyRequested ? " (reply requested)" : "",
1915 : replyTimeStr);
1916 :
1917 0 : pfree(replyTimeStr);
1918 : }
1919 :
1920 : /* See if we can compute the round-trip lag for these positions. */
1921 50134 : now = GetCurrentTimestamp();
1922 50134 : writeLag = LagTrackerRead(SYNC_REP_WAIT_WRITE, writePtr, now);
1923 50134 : flushLag = LagTrackerRead(SYNC_REP_WAIT_FLUSH, flushPtr, now);
1924 50134 : applyLag = LagTrackerRead(SYNC_REP_WAIT_APPLY, applyPtr, now);
1925 :
1926 : /*
1927 : * If the standby reports that it has fully replayed the WAL in two
1928 : * consecutive reply messages, then the second such message must result
1929 : * from wal_receiver_status_interval expiring on the standby. This is a
1930 : * convenient time to forget the lag times measured when it last
1931 : * wrote/flushed/applied a WAL record, to avoid displaying stale lag data
1932 : * until more WAL traffic arrives.
1933 : */
1934 50134 : clearLagTimes = false;
1935 50134 : if (applyPtr == sentPtr)
1936 : {
1937 706 : if (fullyAppliedLastTime)
1938 430 : clearLagTimes = true;
1939 706 : fullyAppliedLastTime = true;
1940 : }
1941 : else
1942 49428 : fullyAppliedLastTime = false;
1943 :
1944 : /* Send a reply if the standby requested one. */
1945 50134 : if (replyRequested)
1946 0 : WalSndKeepalive(false);
1947 :
1948 : /*
1949 : * Update shared state for this WalSender process based on reply data from
1950 : * standby. A lag field is overwritten only when its new value is not -1
1951 : * or when clearLagTimes is set; otherwise the previous value is kept.
1952 : * NOTE(review): -1 presumably means "no sample"; LagTrackerRead is not visible here.
1953 : */
1954 : {
1955 50134 : WalSnd *walsnd = MyWalSnd;
1956 :
1957 50134 : SpinLockAcquire(&walsnd->mutex);
1958 50134 : walsnd->write = writePtr;
1959 50134 : walsnd->flush = flushPtr;
1960 50134 : walsnd->apply = applyPtr;
1961 50134 : if (writeLag != -1 || clearLagTimes)
1962 842 : walsnd->writeLag = writeLag;
1963 50134 : if (flushLag != -1 || clearLagTimes)
1964 4212 : walsnd->flushLag = flushLag;
1965 50134 : if (applyLag != -1 || clearLagTimes)
1966 842 : walsnd->applyLag = applyLag;
1967 50134 : walsnd->replyTime = replyTime;
1968 50134 : SpinLockRelease(&walsnd->mutex);
1969 : }
1970 :
1971 50134 : if (!am_cascading_walsender) /* waiters are released only when not cascading */
1972 50134 : SyncRepReleaseWaiters();
1973 :
1974 : /*
1975 : * Advance our local xmin horizon when the client confirmed a flush.
1976 : */
1977 50134 : if (MyReplicationSlot && flushPtr != InvalidXLogRecPtr)
1978 : {
1979 50132 : if (SlotIsLogical(MyReplicationSlot))
1980 50132 : LogicalConfirmReceivedLocation(flushPtr);
1981 : else
1982 0 : PhysicalConfirmReceivedLocation(flushPtr);
1983 : }
1984 50134 : }
1983 :
1984 : /* compute new replication slot xmin horizon if needed: stores the feedback xmin and catalog xmin in MyReplicationSlot and recomputes the required xmin when either changed */
1985 : static void
1986 0 : PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbackCatalogXmin)
1987 : {
1988 0 : bool changed = false;
1989 0 : ReplicationSlot *slot = MyReplicationSlot;
1990 :
1991 0 : SpinLockAcquire(&slot->mutex);
1992 0 : MyProc->xmin = InvalidTransactionId; /* our own PGPROC xmin is cleared; the slot holds the values below */
1993 :
1994 : /*
1995 : * For physical replication we don't need the interlock provided by xmin
1996 : * and effective_xmin since the consequences of a missed increase are
1997 : * limited to query cancellations, so set both at once.
1998 : */
1999 0 : if (!TransactionIdIsNormal(slot->data.xmin) ||
2000 0 : !TransactionIdIsNormal(feedbackXmin) ||
2001 0 : TransactionIdPrecedes(slot->data.xmin, feedbackXmin))
2002 : {
2003 0 : changed = true;
2004 0 : slot->data.xmin = feedbackXmin;
2005 0 : slot->effective_xmin = feedbackXmin;
2006 : }
2007 0 : if (!TransactionIdIsNormal(slot->data.catalog_xmin) || /* same rule, applied to the catalog xmin */
2008 0 : !TransactionIdIsNormal(feedbackCatalogXmin) ||
2009 0 : TransactionIdPrecedes(slot->data.catalog_xmin, feedbackCatalogXmin))
2010 : {
2011 0 : changed = true;
2012 0 : slot->data.catalog_xmin = feedbackCatalogXmin;
2013 0 : slot->effective_catalog_xmin = feedbackCatalogXmin;
2014 : }
2015 0 : SpinLockRelease(&slot->mutex);
2016 :
2017 0 : if (changed) /* follow-up work happens after the spinlock is released */
2018 : {
2019 0 : ReplicationSlotMarkDirty();
2020 0 : ReplicationSlotsComputeRequiredXmin(false);
2021 : }
2022 0 : }
2023 :
2024 : /*
2025 : * Check that the provided xmin/epoch are sane, that is, not in the future
2026 : * and not so far back as to be already wrapped around.
2027 : *
2028 : * Epoch of nextXid should be same as standby, or if the counter has
2029 : * wrapped, then one greater than standby.
2030 : *
2031 : * This check doesn't care about whether clog exists for these xids
2032 : * at all.
2033 : * Returns true when xid/epoch pass both checks, false otherwise. */
2034 : static bool
2035 0 : TransactionIdInRecentPast(TransactionId xid, uint32 epoch)
2036 : {
2037 : FullTransactionId nextFullXid;
2038 : TransactionId nextXid;
2039 : uint32 nextEpoch;
2040 :
2041 0 : nextFullXid = ReadNextFullTransactionId(); /* split into xid and epoch parts below */
2042 0 : nextXid = XidFromFullTransactionId(nextFullXid);
2043 0 : nextEpoch = EpochFromFullTransactionId(nextFullXid);
2044 :
2045 0 : if (xid <= nextXid) /* numerically not past nextXid: epochs must match */
2046 : {
2047 0 : if (epoch != nextEpoch)
2048 0 : return false;
2049 : }
2050 : else /* numerically greater: acceptable only from the preceding epoch */
2051 : {
2052 0 : if (epoch + 1 != nextEpoch)
2053 0 : return false;
2054 : }
2055 :
2056 0 : if (!TransactionIdPrecedesOrEquals(xid, nextXid))
2057 0 : return false; /* epoch OK, but it's wrapped around */
2058 :
2059 0 : return true;
2060 : }
2061 :
2062 : /*
2063 : * Hot Standby feedback: handle an 'h' message carrying the standby's reply time, xmin and catalog xmin (each with its epoch).
2064 : */
2065 : static void
2066 0 : ProcessStandbyHSFeedbackMessage(void)
2067 : {
2068 : TransactionId feedbackXmin;
2069 : uint32 feedbackEpoch;
2070 : TransactionId feedbackCatalogXmin;
2071 : uint32 feedbackCatalogEpoch;
2072 : TimestampTz replyTime;
2073 :
2074 : /*
2075 : * Decipher the reply message. The caller already consumed the msgtype
2076 : * byte. See XLogWalRcvSendHSFeedback() in walreceiver.c for the creation
2077 : * of this message.
2078 : */
2079 0 : replyTime = pq_getmsgint64(&reply_message);
2080 0 : feedbackXmin = pq_getmsgint(&reply_message, 4);
2081 0 : feedbackEpoch = pq_getmsgint(&reply_message, 4);
2082 0 : feedbackCatalogXmin = pq_getmsgint(&reply_message, 4);
2083 0 : feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4);
2084 :
2085 0 : if (log_min_messages <= DEBUG2)
2086 : {
2087 : char *replyTimeStr;
2088 :
2089 : /* Copy because timestamptz_to_str returns a static buffer */
2090 0 : replyTimeStr = pstrdup(timestamptz_to_str(replyTime));
2091 :
2092 0 : elog(DEBUG2, "hot standby feedback xmin %u epoch %u, catalog_xmin %u epoch %u reply_time %s",
2093 : feedbackXmin,
2094 : feedbackEpoch,
2095 : feedbackCatalogXmin,
2096 : feedbackCatalogEpoch,
2097 : replyTimeStr);
2098 :
2099 0 : pfree(replyTimeStr);
2100 : }
2101 :
2102 : /*
2103 : * Update shared state for this WalSender process based on reply data from
2104 : * standby. Only replyTime is published here, under the WalSnd spinlock.
2105 : */
2106 : {
2107 0 : WalSnd *walsnd = MyWalSnd;
2108 :
2109 0 : SpinLockAcquire(&walsnd->mutex);
2110 0 : walsnd->replyTime = replyTime;
2111 0 : SpinLockRelease(&walsnd->mutex);
2112 : }
2113 :
2114 : /*
2115 : * Unset WalSender's xmins if the feedback message values are invalid.
2116 : * This happens when the downstream turned hot_standby_feedback off.
2117 : */
2118 0 : if (!TransactionIdIsNormal(feedbackXmin)
2119 0 : && !TransactionIdIsNormal(feedbackCatalogXmin))
2120 : {
2121 0 : MyProc->xmin = InvalidTransactionId;
2122 0 : if (MyReplicationSlot != NULL)
2123 0 : PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin);
2124 0 : return;
2125 : }
2126 :
2127 : /*
2128 : * Check that the provided xmin/epoch are sane, that is, not in the future
2129 : * and not so far back as to be already wrapped around. Ignore if not.
2130 : */
2131 0 : if (TransactionIdIsNormal(feedbackXmin) &&
2132 0 : !TransactionIdInRecentPast(feedbackXmin, feedbackEpoch))
2133 0 : return;
2134 :
2135 0 : if (TransactionIdIsNormal(feedbackCatalogXmin) && /* same sanity check for the catalog xmin */
2136 0 : !TransactionIdInRecentPast(feedbackCatalogXmin, feedbackCatalogEpoch))
2137 0 : return;
2138 :
2139 : /*
2140 : * Set the WalSender's xmin equal to the standby's requested xmin, so that
2141 : * the xmin will be taken into account by GetSnapshotData() /
2142 : * ComputeXidHorizons(). This will hold back the removal of dead rows and
2143 : * thereby prevent the generation of cleanup conflicts on the standby
2144 : * server.
2145 : *
2146 : * There is a small window for a race condition here: although we just
2147 : * checked that feedbackXmin precedes nextXid, the nextXid could have
2148 : * gotten advanced between our fetching it and applying the xmin below,
2149 : * perhaps far enough to make feedbackXmin wrap around. In that case the
2150 : * xmin we set here would be "in the future" and have no effect. No point
2151 : * in worrying about this since it's too late to save the desired data
2152 : * anyway. Assuming that the standby sends us an increasing sequence of
2153 : * xmins, this could only happen during the first reply cycle, else our
2154 : * own xmin would prevent nextXid from advancing so far.
2155 : *
2156 : * We don't bother taking the ProcArrayLock here. Setting the xmin field
2157 : * is assumed atomic, and there's no real need to prevent concurrent
2158 : * horizon determinations. (If we're moving our xmin forward, this is
2159 : * obviously safe, and if we're moving it backwards, well, the data is at
2160 : * risk already since a VACUUM could already have determined the horizon.)
2161 : *
2162 : * If we're using a replication slot we reserve the xmin via that,
2163 : * otherwise via the walsender's PGPROC entry. We can only track the
2164 : * catalog xmin separately when using a slot, so we store the least of the
2165 : * two provided when not using a slot.
2166 : *
2167 : * XXX: It might make sense to generalize the ephemeral slot concept and
2168 : * always use the slot mechanism to handle the feedback xmin.
2169 : */
2170 0 : if (MyReplicationSlot != NULL) /* XXX: persistency configurable? */
2171 0 : PhysicalReplicationSlotNewXmin(feedbackXmin, feedbackCatalogXmin);
2172 : else
2173 : {
2174 0 : if (TransactionIdIsNormal(feedbackCatalogXmin) /* catalog xmin is used only when normal and older */
2175 0 : && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin))
2176 0 : MyProc->xmin = feedbackCatalogXmin;
2177 : else
2178 0 : MyProc->xmin = feedbackXmin;
2179 : }
2180 : }
2181 :
2182 : /*
2183 : * Compute how long send/receive loops should sleep.
2184 : *
2185 : * If wal_sender_timeout is enabled we want to wake up in time to send
2186 : * keepalives and to abort the connection if wal_sender_timeout has been
2187 : * reached. "now" is the reference time; the result is in milliseconds.
2188 : */
2189 : static long
2190 4790 : WalSndComputeSleeptime(TimestampTz now)
2191 : {
2192 4790 : long sleeptime = 10000; /* 10 s; used when no timeout applies */
2193 :
2194 4790 : if (wal_sender_timeout > 0 && last_reply_timestamp > 0)
2195 : {
2196 : TimestampTz wakeup_time;
2197 : long sec_to_timeout;
2198 : int microsec_to_timeout;
2199 :
2200 : /*
2201 : * At the latest stop sleeping once wal_sender_timeout has been
2202 : * reached.
2203 : */
2204 4784 : wakeup_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
2205 : wal_sender_timeout);
2206 :
2207 : /*
2208 : * If no ping has been sent yet, wakeup when it's time to do so.
2209 : * WalSndKeepaliveIfNecessary() wants to send a keepalive once half of
2210 : * the timeout passed without a response.
2211 : */
2212 4784 : if (!waiting_for_ping_response)
2213 1440 : wakeup_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
2214 : wal_sender_timeout / 2);
2215 :
2216 : /* Compute relative time until wakeup; both outputs are passed by address. */
2217 4784 : TimestampDifference(now, wakeup_time,
2218 : &sec_to_timeout, &microsec_to_timeout);
2219 :
2220 9568 : sleeptime = sec_to_timeout * 1000 + /* seconds and microseconds folded into milliseconds */
2221 4784 : microsec_to_timeout / 1000;
2222 : }
2223 :
2224 4790 : return sleeptime;
2225 : }
2226 :
2227 : /*
2228 : * Check whether there have been responses by the client within
2229 : * wal_sender_timeout and shutdown if not. Using last_processing as the
2230 : * reference point avoids counting server-side stalls against the client.
2231 : * However, a long server-side stall can make WalSndKeepaliveIfNecessary()
2232 : * postdate last_processing by more than wal_sender_timeout. If that happens,
2233 : * the client must reply almost immediately to avoid a timeout. This rarely
2234 : * affects the default configuration, under which clients spontaneously send a
2235 : * message every standby_message_timeout = wal_sender_timeout/6 = 10s. We
2236 : * could eliminate that problem by recognizing timeout expiration at
2237 : * wal_sender_timeout/2 after the keepalive.
2238 : */
2239 : static void
2240 1183798 : WalSndCheckTimeOut(void)
2241 : {
2242 : TimestampTz timeout;
2243 :
2244 : /* don't bail out if we're doing something that doesn't require timeouts */
2245 1183798 : if (last_reply_timestamp <= 0)
2246 1183804 : return;
2247 :
2248 1183792 : timeout = TimestampTzPlusMilliseconds(last_reply_timestamp, /* deadline = last reply + wal_sender_timeout */
2249 : wal_sender_timeout);
2250 :
2251 1183792 : if (wal_sender_timeout > 0 && last_processing >= timeout) /* a setting <= 0 disables the check */
2252 : {
2253 : /*
2254 : * Since typically expiration of replication timeout means
2255 : * communication problem, we don't send the error message to the
2256 : * standby.
2257 : */
2258 0 : ereport(COMMERROR,
2259 : (errmsg("terminating walsender process due to replication timeout")));
2260 :
2261 0 : WalSndShutdown();
2262 : }
2263 : }
2264 :
2265 : /* Main loop of walsender process that streams the WAL over Copy messages. send_data is the callback invoked to queue more WAL whenever the output buffer is empty. */
2266 : static void
2267 196 : WalSndLoop(WalSndSendDataCallback send_data)
2268 : {
2269 : /*
2270 : * Initialize the last reply timestamp. That enables timeout processing
2271 : * from hereon.
2272 : */
2273 196 : last_reply_timestamp = GetCurrentTimestamp();
2274 196 : waiting_for_ping_response = false;
2275 :
2276 : /*
2277 : * Loop until we reach the end of this timeline or the client requests to
2278 : * stop streaming.
2279 : */
2280 : for (;;)
2281 : {
2282 : /* Clear any already-pending wakeups */
2283 1182446 : ResetLatch(MyLatch);
2284 :
2285 1182446 : CHECK_FOR_INTERRUPTS();
2286 :
2287 : /* Process any requests or signals received recently */
2288 1182446 : if (ConfigReloadPending)
2289 : {
2290 0 : ConfigReloadPending = false;
2291 0 : ProcessConfigFile(PGC_SIGHUP);
2292 0 : SyncRepInitConfig();
2293 : }
2294 :
2295 : /* Check for input from the client */
2296 1182446 : ProcessRepliesIfAny();
2297 :
2298 : /*
2299 : * If we have received CopyDone from the client, sent CopyDone
2300 : * ourselves, and the output buffer is empty, it's time to exit
2301 : * streaming.
2302 : */
2303 1182568 : if (streamingDoneReceiving && streamingDoneSending &&
2304 124 : !pq_is_send_pending())
2305 122 : break; /* the only exit from the loop by falling out of it */
2306 :
2307 : /*
2308 : * If we don't have any pending data in the output buffer, try to send
2309 : * some more. If there is some, we don't bother to call send_data
2310 : * again until we've flushed it ... but we'd better assume we are not
2311 : * caught up.
2312 : */
2313 1182322 : if (!pq_is_send_pending())
2314 1178976 : send_data();
2315 : else
2316 3346 : WalSndCaughtUp = false;
2317 :
2318 : /* Try to flush pending output to the client; a nonzero result shuts the walsender down */
2319 1182254 : if (pq_flush_if_writable() != 0)
2320 0 : WalSndShutdown();
2321 :
2322 : /* If nothing remains to be sent right now ... */
2323 1182254 : if (WalSndCaughtUp && !pq_is_send_pending())
2324 : {
2325 : /*
2326 : * If we're in catchup state, move to streaming. This is an
2327 : * important state change for users to know about, since before
2328 : * this point data loss might occur if the primary dies and we
2329 : * need to failover to the standby. The state change is also
2330 : * important for synchronous replication, since commits that
2331 : * started to wait at that point might wait for some time.
2332 : */
2333 10380 : if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
2334 : {
2335 196 : ereport(DEBUG1,
2336 : (errmsg("\"%s\" has now caught up with upstream server",
2337 : application_name)));
2338 196 : WalSndSetState(WALSNDSTATE_STREAMING);
2339 : }
2340 :
2341 : /*
2342 : * When SIGUSR2 arrives, we send any outstanding logs up to the
2343 : * shutdown checkpoint record (i.e., the latest record), wait for
2344 : * them to be replicated to the standby, and exit. This may be a
2345 : * normal termination at shutdown, or a promotion, the walsender
2346 : * is not sure which.
2347 : */
2348 10380 : if (got_SIGUSR2)
2349 9104 : WalSndDone(send_data);
2350 : }
2351 :
2352 : /* Check for replication timeout. */
2353 1182250 : WalSndCheckTimeOut();
2354 :
2355 : /* Send keepalive if the time has come */
2356 1182250 : WalSndKeepaliveIfNecessary();
2357 :
2358 : /*
2359 : * Block if we have unsent data. XXX For logical replication, let
2360 : * WalSndWaitForWal() handle any other blocking; idle receivers need
2361 : * its additional actions. For physical replication, also block if
2362 : * caught up; its send_data does not block.
2363 : */
2364 1182250 : if ((WalSndCaughtUp && send_data != XLogSendLogical &&
2365 1182250 : !streamingDoneSending) ||
2366 1182250 : pq_is_send_pending())
2367 : {
2368 : long sleeptime;
2369 : int wakeEvents;
2370 :
2371 3344 : wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT |
2372 : WL_SOCKET_READABLE;
2373 :
2374 : /*
2375 : * Use fresh timestamp, not last_processing, to reduce the chance
2376 : * of reaching wal_sender_timeout before sending a keepalive.
2377 : */
2378 3344 : sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
2379 :
2380 3344 : if (pq_is_send_pending()) /* also wake when the socket can accept the pending output */
2381 3344 : wakeEvents |= WL_SOCKET_WRITEABLE;
2382 :
2383 : /* Sleep until something happens or we time out */
2384 3344 : (void) WaitLatchOrSocket(MyLatch, wakeEvents,
2385 3344 : MyProcPort->sock, sleeptime,
2386 : WAIT_EVENT_WAL_SENDER_MAIN);
2387 : }
2388 1182250 : }
2389 122 : }
2390 :
2391 : /* Initialize a per-walsender data structure for this walsender process: claims the first WalSndCtl->walsnds[] entry whose pid is 0, resets its fields, and points MyWalSnd at it */
2392 : static void
2393 288 : InitWalSenderSlot(void)
2394 : {
2395 : int i;
2396 :
2397 : /*
2398 : * WalSndCtl should be set up already (we inherit this by fork() or
2399 : * EXEC_BACKEND mechanism from the postmaster).
2400 : */
2401 288 : Assert(WalSndCtl != NULL);
2402 288 : Assert(MyWalSnd == NULL);
2403 :
2404 : /*
2405 : * Find a free walsender slot and reserve it. This must not fail due to
2406 : * the prior check for free WAL senders in InitProcess().
2407 : */
2408 1012 : for (i = 0; i < max_wal_senders; i++)
2409 : {
2410 506 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
2411 :
2412 506 : SpinLockAcquire(&walsnd->mutex);
2413 :
2414 506 : if (walsnd->pid != 0) /* nonzero pid: entry is taken, try the next one */
2415 : {
2416 218 : SpinLockRelease(&walsnd->mutex);
2417 218 : continue;
2418 : }
2419 : else
2420 : {
2421 : /*
2422 : * Found a free slot. Reserve it for us. All fields are reset
2423 : * while the entry's spinlock is still held.
2424 : */
2425 288 : walsnd->pid = MyProcPid;
2426 288 : walsnd->state = WALSNDSTATE_STARTUP;
2427 288 : walsnd->sentPtr = InvalidXLogRecPtr;
2428 288 : walsnd->needreload = false;
2429 288 : walsnd->write = InvalidXLogRecPtr;
2430 288 : walsnd->flush = InvalidXLogRecPtr;
2431 288 : walsnd->apply = InvalidXLogRecPtr;
2432 288 : walsnd->writeLag = -1;
2433 288 : walsnd->flushLag = -1;
2434 288 : walsnd->applyLag = -1;
2435 288 : walsnd->sync_standby_priority = 0;
2436 288 : walsnd->latch = &MyProc->procLatch;
2437 288 : walsnd->replyTime = 0;
2438 288 : SpinLockRelease(&walsnd->mutex);
2439 : /* don't need the lock anymore */
2440 288 : MyWalSnd = (WalSnd *) walsnd;
2441 :
2442 288 : break;
2443 : }
2444 : }
2445 :
2446 288 : Assert(MyWalSnd != NULL);
2447 :
2448 : /* Arrange to clean up at walsender exit */
2449 288 : on_shmem_exit(WalSndKill, 0);
2450 288 : }
2450 :
2451 : /* Destroy the per-walsender data structure for this walsender process. Registered via on_shmem_exit() in InitWalSenderSlot(); code and arg are not used in the body. */
2452 : static void
2453 288 : WalSndKill(int code, Datum arg)
2454 : {
2455 288 : WalSnd *walsnd = MyWalSnd;
2456 :
2457 288 : Assert(walsnd != NULL);
2458 :
2459 288 : MyWalSnd = NULL; /* drop our process-local pointer before releasing the shared entry */
2460 :
2461 288 : SpinLockAcquire(&walsnd->mutex);
2462 : /* clear latch while holding the spinlock, so it can safely be read */
2463 288 : walsnd->latch = NULL;
2464 : /* Mark WalSnd struct as no longer being in use. */
2465 288 : walsnd->pid = 0;
2466 288 : SpinLockRelease(&walsnd->mutex);
2467 288 : }
2468 :
2469 : /* XLogReaderRoutine->segment_open callback */
2470 : static void
2471 18584 : WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
2472 : TimeLineID *tli_p)
2473 : {
2474 : char path[MAXPGPATH];
2475 :
2476 : /*-------
2477 : * When reading from a historic timeline, and there is a timeline switch
2478 : * within this segment, read from the WAL segment belonging to the new
2479 : * timeline.
2480 : *
2481 : * For example, imagine that this server is currently on timeline 5, and
2482 : * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
2483 : * 0/13002088. In pg_wal, we have these files:
2484 : *
2485 : * ...
2486 : * 000000040000000000000012
2487 : * 000000040000000000000013
2488 : * 000000050000000000000013
2489 : * 000000050000000000000014
2490 : * ...
2491 : *
2492 : * In this situation, when requested to send the WAL from segment 0x13, on
2493 : * timeline 4, we read the WAL from file 000000050000000000000013. Archive
2494 : * recovery prefers files from newer timelines, so if the segment was
2495 : * restored from the archive on this server, the file belonging to the old
2496 : * timeline, 000000040000000000000013, might not exist. Their contents are
2497 : * equal up to the switchpoint, because at a timeline switch, the used
2498 : * portion of the old segment is copied to the new file. -------
2499 : */
2500 18584 : *tli_p = sendTimeLine;
2501 18584 : if (sendTimeLineIsHistoric)
2502 : {
2503 : XLogSegNo endSegNo;
2504 :
2505 0 : XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
2506 0 : if (state->seg.ws_segno == endSegNo)
2507 0 : *tli_p = sendTimeLineNextTLI;
2508 : }
2509 :
2510 18584 : XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
2511 18584 : state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
2512 18584 : if (state->seg.ws_file >= 0)
2513 37168 : return;
2514 :
2515 : /*
2516 : * If the file is not found, assume it's because the standby asked for a
2517 : * too old WAL segment that has already been removed or recycled.
2518 : */
2519 0 : if (errno == ENOENT)
2520 : {
2521 : char xlogfname[MAXFNAMELEN];
2522 0 : int save_errno = errno;
2523 :
2524 0 : XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
2525 0 : errno = save_errno;
2526 0 : ereport(ERROR,
2527 : (errcode_for_file_access(),
2528 : errmsg("requested WAL segment %s has already been removed",
2529 : xlogfname)));
2530 : }
2531 : else
2532 0 : ereport(ERROR,
2533 : (errcode_for_file_access(),
2534 : errmsg("could not open file \"%s\": %m",
2535 : path)));
2536 : }
2537 :
2538 : /*
2539 : * Send out the WAL in its normal physical/stored form.
2540 : *
2541 : * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
2542 : * but not yet sent to the client, and buffer it in the libpq output
2543 : * buffer.
2544 : *
2545 : * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
2546 : * otherwise WalSndCaughtUp is set to false.
2547 : */
2548 : static void
2549 0 : XLogSendPhysical(void)
2550 : {
2551 : XLogRecPtr SendRqstPtr;
2552 : XLogRecPtr startptr;
2553 : XLogRecPtr endptr;
2554 : Size nbytes;
2555 : XLogSegNo segno;
2556 : WALReadError errinfo;
2557 :
2558 : /* If requested switch the WAL sender to the stopping state. */
2559 0 : if (got_STOPPING)
2560 0 : WalSndSetState(WALSNDSTATE_STOPPING);
2561 :
2562 0 : if (streamingDoneSending)
2563 : {
2564 0 : WalSndCaughtUp = true;
2565 0 : return;
2566 : }
2567 :
2568 : /* Figure out how far we can safely send the WAL. */
2569 0 : if (sendTimeLineIsHistoric)
2570 : {
2571 : /*
2572 : * Streaming an old timeline that's in this server's history, but is
2573 : * not the one we're currently inserting or replaying. It can be
2574 : * streamed up to the point where we switched off that timeline.
2575 : */
2576 0 : SendRqstPtr = sendTimeLineValidUpto;
2577 : }
2578 0 : else if (am_cascading_walsender)
2579 : {
2580 : /*
2581 : * Streaming the latest timeline on a standby.
2582 : *
2583 : * Attempt to send all WAL that has already been replayed, so that we
2584 : * know it's valid. If we're receiving WAL through streaming
2585 : * replication, it's also OK to send any WAL that has been received
2586 : * but not replayed.
2587 : *
2588 : * The timeline we're recovering from can change, or we can be
2589 : * promoted. In either case, the current timeline becomes historic. We
2590 : * need to detect that so that we don't try to stream past the point
2591 : * where we switched to another timeline. We check for promotion or
2592 : * timeline switch after calculating FlushPtr, to avoid a race
2593 : * condition: if the timeline becomes historic just after we checked
2594 : * that it was still current, it's still OK to stream it up to the
2595 : * FlushPtr that was calculated before it became historic.
2596 : */
2597 0 : bool becameHistoric = false;
2598 :
2599 0 : SendRqstPtr = GetStandbyFlushRecPtr();
2600 :
2601 0 : if (!RecoveryInProgress())
2602 : {
2603 : /*
2604 : * We have been promoted. RecoveryInProgress() updated
2605 : * ThisTimeLineID to the new current timeline.
2606 : */
2607 0 : am_cascading_walsender = false;
2608 0 : becameHistoric = true;
2609 : }
2610 : else
2611 : {
2612 : /*
2613 : * Still a cascading standby. But is the timeline we're sending
2614 : * still the one recovery is recovering from? ThisTimeLineID was
2615 : * updated by the GetStandbyFlushRecPtr() call above.
2616 : */
2617 0 : if (sendTimeLine != ThisTimeLineID)
2618 0 : becameHistoric = true;
2619 : }
2620 :
2621 0 : if (becameHistoric)
2622 : {
2623 : /*
2624 : * The timeline we were sending has become historic. Read the
2625 : * timeline history file of the new timeline to see where exactly
2626 : * we forked off from the timeline we were sending.
2627 : */
2628 : List *history;
2629 :
2630 0 : history = readTimeLineHistory(ThisTimeLineID);
2631 0 : sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
2632 :
2633 0 : Assert(sendTimeLine < sendTimeLineNextTLI);
2634 0 : list_free_deep(history);
2635 :
2636 0 : sendTimeLineIsHistoric = true;
2637 :
2638 0 : SendRqstPtr = sendTimeLineValidUpto;
2639 : }
2640 : }
2641 : else
2642 : {
2643 : /*
2644 : * Streaming the current timeline on a primary.
2645 : *
2646 : * Attempt to send all data that's already been written out and
2647 : * fsync'd to disk. We cannot go further than what's been written out
2648 : * given the current implementation of WALRead(). And in any case
2649 : * it's unsafe to send WAL that is not securely down to disk on the
2650 : * primary: if the primary subsequently crashes and restarts, standbys
2651 : * must not have applied any WAL that got lost on the primary.
2652 : */
2653 0 : SendRqstPtr = GetFlushRecPtr();
2654 : }
2655 :
2656 : /*
2657 : * Record the current system time as an approximation of the time at which
2658 : * this WAL location was written for the purposes of lag tracking.
2659 : *
2660 : * In theory we could make XLogFlush() record a time in shmem whenever WAL
2661 : * is flushed and we could get that time as well as the LSN when we call
2662 : * GetFlushRecPtr() above (and likewise for the cascading standby
2663 : * equivalent), but rather than putting any new code into the hot WAL path
2664 : * it seems good enough to capture the time here. We should reach this
2665 : * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
2666 : * may take some time, we read the WAL flush pointer and take the time
2667 : * very close together here so that we'll get a later position if it is
2668 : * still moving.
2669 : *
2670 : * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
2671 : * this gives us a cheap approximation for the WAL flush time for this
2672 : * LSN.
2673 : *
2674 : * Note that the LSN is not necessarily the LSN for the data contained in
2675 : * the present message; it's the end of the WAL, which might be further
2676 : * ahead. All the lag tracking machinery cares about is finding out when
2677 : * that arbitrary LSN is eventually reported as written, flushed and
2678 : * applied, so that it can measure the elapsed time.
2679 : */
2680 0 : LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
2681 :
2682 : /*
2683 : * If this is a historic timeline and we've reached the point where we
2684 : * forked to the next timeline, stop streaming.
2685 : *
2686 : * Note: We might already have sent WAL > sendTimeLineValidUpto. The
2687 : * startup process will normally replay all WAL that has been received
2688 : * from the primary, before promoting, but if the WAL streaming is
2689 : * terminated at a WAL page boundary, the valid portion of the timeline
2690 : * might end in the middle of a WAL record. We might've already sent the
2691 : * first half of that partial WAL record to the cascading standby, so that
2692 : * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
2693 : * replay the partial WAL record either, so it can still follow our
2694 : * timeline switch.
2695 : */
2696 0 : if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
2697 : {
2698 : /* close the current file. */
2699 0 : if (xlogreader->seg.ws_file >= 0)
2700 0 : wal_segment_close(xlogreader);
2701 :
2702 : /* Send CopyDone */
2703 0 : pq_putmessage_noblock('c', NULL, 0);
2704 0 : streamingDoneSending = true;
2705 :
2706 0 : WalSndCaughtUp = true;
2707 :
2708 0 : elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
2709 : (uint32) (sendTimeLineValidUpto >> 32), (uint32) sendTimeLineValidUpto,
2710 : (uint32) (sentPtr >> 32), (uint32) sentPtr);
2711 0 : return;
2712 : }
2713 :
2714 : /* Do we have any work to do? */
2715 0 : Assert(sentPtr <= SendRqstPtr);
2716 0 : if (SendRqstPtr <= sentPtr)
2717 : {
2718 0 : WalSndCaughtUp = true;
2719 0 : return;
2720 : }
2721 :
2722 : /*
2723 : * Figure out how much to send in one message. If there's no more than
2724 : * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
2725 : * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
2726 : *
2727 : * The rounding is not only for performance reasons. Walreceiver relies on
2728 : * the fact that we never split a WAL record across two messages. Since a
2729 : * long WAL record is split at page boundary into continuation records,
2730 : * page boundary is always a safe cut-off point. We also assume that
2731 : * SendRqstPtr never points to the middle of a WAL record.
2732 : */
2733 0 : startptr = sentPtr;
2734 0 : endptr = startptr;
2735 0 : endptr += MAX_SEND_SIZE;
2736 :
2737 : /* if we went beyond SendRqstPtr, back off */
2738 0 : if (SendRqstPtr <= endptr)
2739 : {
2740 0 : endptr = SendRqstPtr;
2741 0 : if (sendTimeLineIsHistoric)
2742 0 : WalSndCaughtUp = false;
2743 : else
2744 0 : WalSndCaughtUp = true;
2745 : }
2746 : else
2747 : {
2748 : /* round down to page boundary. */
2749 0 : endptr -= (endptr % XLOG_BLCKSZ);
2750 0 : WalSndCaughtUp = false;
2751 : }
2752 :
2753 0 : nbytes = endptr - startptr;
2754 0 : Assert(nbytes <= MAX_SEND_SIZE);
2755 :
2756 : /*
2757 : * OK to read and send the slice.
2758 : */
2759 0 : resetStringInfo(&output_message);
2760 0 : pq_sendbyte(&output_message, 'w');
2761 :
2762 0 : pq_sendint64(&output_message, startptr); /* dataStart */
2763 0 : pq_sendint64(&output_message, SendRqstPtr); /* walEnd */
2764 0 : pq_sendint64(&output_message, 0); /* sendtime, filled in last */
2765 :
2766 : /*
2767 : * Read the log directly into the output buffer to avoid extra memcpy
2768 : * calls.
2769 : */
2770 0 : enlargeStringInfo(&output_message, nbytes);
2771 :
2772 : retry:
2773 0 : if (!WALRead(xlogreader,
2774 0 : &output_message.data[output_message.len],
2775 : startptr,
2776 : nbytes,
2777 0 : xlogreader->seg.ws_tli, /* Pass the current TLI because
2778 : * only WalSndSegmentOpen controls
2779 : * whether new TLI is needed. */
2780 : &errinfo))
2781 0 : WALReadRaiseError(&errinfo);
2782 :
2783 : /* See logical_read_xlog_page(). */
2784 0 : XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize);
2785 0 : CheckXLogRemoved(segno, xlogreader->seg.ws_tli);
2786 :
2787 : /*
2788 : * During recovery, the currently-open WAL file might be replaced with the
2789 : * file of the same name retrieved from archive. So we always need to
2790 : * check what we read was valid after reading into the buffer. If it's
2791 : * invalid, we try to open and read the file again.
2792 : */
2793 0 : if (am_cascading_walsender)
2794 : {
2795 0 : WalSnd *walsnd = MyWalSnd;
2796 : bool reload;
2797 :
2798 0 : SpinLockAcquire(&walsnd->mutex);
2799 0 : reload = walsnd->needreload;
2800 0 : walsnd->needreload = false;
2801 0 : SpinLockRelease(&walsnd->mutex);
2802 :
2803 0 : if (reload && xlogreader->seg.ws_file >= 0)
2804 : {
2805 0 : wal_segment_close(xlogreader);
2806 :
2807 0 : goto retry;
2808 : }
2809 : }
2810 :
2811 0 : output_message.len += nbytes;
2812 0 : output_message.data[output_message.len] = '\0';
2813 :
2814 : /*
2815 : * Fill the send timestamp last, so that it is taken as late as possible.
2816 : */
2817 0 : resetStringInfo(&tmpbuf);
2818 0 : pq_sendint64(&tmpbuf, GetCurrentTimestamp());
2819 0 : memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)],
2820 0 : tmpbuf.data, sizeof(int64));
2821 :
2822 0 : pq_putmessage_noblock('d', output_message.data, output_message.len);
2823 :
2824 0 : sentPtr = endptr;
2825 :
2826 : /* Update shared memory status */
2827 : {
2828 0 : WalSnd *walsnd = MyWalSnd;
2829 :
2830 0 : SpinLockAcquire(&walsnd->mutex);
2831 0 : walsnd->sentPtr = sentPtr;
2832 0 : SpinLockRelease(&walsnd->mutex);
2833 : }
2834 :
2835 : /* Report progress of XLOG streaming in PS display */
2836 0 : if (update_process_title)
2837 : {
2838 : char activitymsg[50];
2839 :
2840 0 : snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
2841 0 : (uint32) (sentPtr >> 32), (uint32) sentPtr);
2842 0 : set_ps_display(activitymsg);
2843 : }
2844 : }
2845 :
2846 : /*
2847 : * Stream out logically decoded data: read and decode at most one WAL record per call, then refresh WalSndCaughtUp and publish sentPtr in shared memory.
2848 : */
2849 : static void
2850 1188080 : XLogSendLogical(void)
2851 : {
2852 : XLogRecord *record;
2853 : char *errm;
2854 :
2855 : /*
2856 : * We'll use the current flush point to determine whether we've caught up.
2857 : * This variable is static in order to cache it across calls. Caching is
2858 : * helpful because GetFlushRecPtr() needs to acquire a heavily-contended
2859 : * spinlock.
2860 : */
2861 : static XLogRecPtr flushPtr = InvalidXLogRecPtr;
2862 :
2863 : /*
2864 : * Don't know whether we've caught up yet. We'll set WalSndCaughtUp to
2865 : * true in WalSndWaitForWal, if we're actually waiting. We also set it to
2866 : * true if XLogReadRecord() had to stop reading but WalSndWaitForWal
2867 : * didn't wait - i.e. when we're shutting down.
2868 : */
2869 1188080 : WalSndCaughtUp = false;
2870 :
2871 1188080 : record = XLogReadRecord(logical_decoding_ctx->reader, &errm);
2872 :
2873 : /* the reader reported an error message: the xlog record was invalid */
2874 1188018 : if (errm != NULL)
2875 0 : elog(ERROR, "%s", errm);
2876 :
2877 1188018 : if (record != NULL)
2878 : {
2879 : /*
2880 : * Note the lack of any call to LagTrackerWrite() which is handled by
2881 : * WalSndUpdateProgress which is called by output plugin through
2882 : * logical decoding write api.
2883 : */
2884 1169690 : LogicalDecodingProcessRecord(logical_decoding_ctx, logical_decoding_ctx->reader);
2885 :
2886 1169684 : sentPtr = logical_decoding_ctx->reader->EndRecPtr; /* sentPtr only advances when a record was actually read */
2887 : }
2888 :
2889 : /*
2890 : * If first time through in this session, initialize flushPtr. Otherwise,
2891 : * we only need to update flushPtr if EndRecPtr has reached or passed it.
2892 : */
2893 1188012 : if (flushPtr == InvalidXLogRecPtr)
2894 196 : flushPtr = GetFlushRecPtr();
2895 1187816 : else if (logical_decoding_ctx->reader->EndRecPtr >= flushPtr)
2896 19396 : flushPtr = GetFlushRecPtr();
2897 :
2898 : /* If EndRecPtr is still at or past our (possibly refreshed) flushPtr, it means we caught up. */
2899 1188012 : if (logical_decoding_ctx->reader->EndRecPtr >= flushPtr)
2900 19056 : WalSndCaughtUp = true;
2901 :
2902 : /*
2903 : * If we're caught up and have been requested to stop, have WalSndLoop()
2904 : * terminate the connection in an orderly manner, after writing out all
2905 : * the pending data. This is done by raising the same flag the SIGUSR2 handler sets.
2906 : */
2907 1188012 : if (WalSndCaughtUp && got_STOPPING)
2908 18208 : got_SIGUSR2 = true;
2909 :
2910 : /* Update shared memory status (under this walsender's spinlock) */
2911 : {
2912 1188012 : WalSnd *walsnd = MyWalSnd;
2913 :
2914 1188012 : SpinLockAcquire(&walsnd->mutex);
2915 1188012 : walsnd->sentPtr = sentPtr;
2916 1188012 : SpinLockRelease(&walsnd->mutex);
2917 : }
2918 1188012 : }
2919 :
2920 : /*
2921 : * Shutdown if the sender is caught up. 'send_data' is the callback invoked once more to push out any remaining WAL.
2922 : *
2923 : * NB: This should only be called when the shutdown signal has been received
2924 : * from postmaster.
2925 : *
2926 : * Note that if we determine that there's still more data to send, this
2927 : * function will return control to the caller (after requesting a reply via keepalive unless one is already awaited).
2928 : */
2929 : static void
2930 9104 : WalSndDone(WalSndSendDataCallback send_data)
2931 : {
2932 : XLogRecPtr replicatedPtr;
2933 :
2934 : /* ... let's just be real sure we're caught up ... */
2935 9104 : send_data();
2936 :
2937 : /*
2938 : * To figure out whether all WAL has successfully been replicated, check
2939 : * flush location if valid, write otherwise. Tools like pg_receivewal will
2940 : * usually (unless in synchronous mode) return an invalid flush location.
2941 : */
2942 18208 : replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ?
2943 9104 : MyWalSnd->write : MyWalSnd->flush;
2944 :
2945 9108 : if (WalSndCaughtUp && sentPtr == replicatedPtr &&
2946 4 : !pq_is_send_pending())
2947 : {
2948 : QueryCompletion qc;
2949 :
2950 : /* Inform the standby that XLOG streaming is done, flush the output, then exit the process */
2951 4 : SetQueryCompletion(&qc, CMDTAG_COPY, 0);
2952 4 : EndCommand(&qc, DestRemote, false);
2953 4 : pq_flush();
2954 :
2955 4 : proc_exit(0);
2956 : }
2957 9100 : if (!waiting_for_ping_response) /* not done yet: ask for a reply unless one is already outstanding */
2958 3344 : WalSndKeepalive(true);
2959 9100 : }
2960 :
2961 : /*
2962 : * Returns the latest point in WAL that has been safely flushed to disk, and
2963 : * can be sent to the standby. This should only be called when in recovery,
2964 : * ie. we're streaming to a cascaded standby.
2965 : *
2966 : * As a side-effect, ThisTimeLineID is updated to the TLI of the last
2967 : * replayed WAL record.
2968 : */
2969 : static XLogRecPtr
2970 0 : GetStandbyFlushRecPtr(void)
2971 : {
2972 : XLogRecPtr replayPtr;
2973 : TimeLineID replayTLI;
2974 : XLogRecPtr receivePtr;
2975 : TimeLineID receiveTLI;
2976 : XLogRecPtr result;
2977 :
2978 : /*
2979 : * We can safely send what's already been replayed. Also, if walreceiver
2980 : * is streaming WAL from the same timeline, we can send anything that it
2981 : * has streamed, but hasn't been replayed yet.
2982 : */
2983 :
2984 0 : receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
2985 0 : replayPtr = GetXLogReplayRecPtr(&replayTLI);
2986 :
2987 0 : ThisTimeLineID = replayTLI;
2988 :
2989 0 : result = replayPtr; /* default: the replay position */
2990 0 : if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr) /* walreceiver is ahead on the replay timeline */
2991 0 : result = receivePtr;
2992 :
2993 0 : return result;
2994 : }
2995 :
2996 : /*
2997 : * Request walsenders to reload the currently-open WAL file, by setting needreload in every slot that has a nonzero pid (each under that slot's spinlock).
2998 : */
2999 : void
3000 0 : WalSndRqstFileReload(void)
3001 : {
3002 : int i;
3003 :
3004 0 : for (i = 0; i < max_wal_senders; i++)
3005 : {
3006 0 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3007 :
3008 0 : SpinLockAcquire(&walsnd->mutex);
3009 0 : if (walsnd->pid == 0) /* no process recorded in this slot: nothing to flag */
3010 : {
3011 0 : SpinLockRelease(&walsnd->mutex);
3012 0 : continue;
3013 : }
3014 0 : walsnd->needreload = true;
3015 0 : SpinLockRelease(&walsnd->mutex);
3016 : }
3017 0 : }
3018 :
3019 : /*
3020 : * Handle PROCSIG_WALSND_INIT_STOPPING signal. Must only run in a walsender process (asserted).
3021 : */
3022 : void
3023 4 : HandleWalSndInitStopping(void)
3024 : {
3025 4 : Assert(am_walsender);
3026 :
3027 : /*
3028 : * If replication has not yet started, die like with SIGTERM. If
3029 : * replication is active, only set a flag and wake up the main loop. It
3030 : * will send any outstanding WAL, wait for it to be replicated to the
3031 : * standby, and then exit gracefully. NOTE(review): no latch is set in this
3032 : * function itself; the wakeup is presumably done by the caller - confirm.
3033 : */
3034 4 : if (!replication_active)
3035 0 : kill(MyProcPid, SIGTERM); /* signal ourselves so the regular SIGTERM path runs */
3036 : else
3037 4 : got_STOPPING = true;
3038 4 : }
3038 :
3039 : /*
3040 : * SIGUSR2: set flag to do a last cycle and shut down afterwards. The WAL
3041 : * sender should already have been switched to WALSNDSTATE_STOPPING at
3042 : * this point. Only sets got_SIGUSR2 and the process latch; errno is preserved.
3043 : */
3044 : static void
3045 0 : WalSndLastCycleHandler(SIGNAL_ARGS)
3046 : {
3047 0 : int save_errno = errno; /* saved so the handler does not disturb the interrupted code's errno */
3048 :
3049 0 : got_SIGUSR2 = true;
3050 0 : SetLatch(MyLatch); /* so that a wait on our latch returns and the flag gets noticed */
3051 :
3052 0 : errno = save_errno;
3053 0 : }
3054 :
3055 : /* Install the signal dispositions used by a walsender process */
3056 : void
3057 288 : WalSndSignals(void)
3058 : {
3059 : /* Set up signal handlers */
3060 288 : pqsignal(SIGHUP, SignalHandlerForConfigReload); /* config reload request */
3061 288 : pqsignal(SIGINT, StatementCancelHandler); /* query cancel */
3062 288 : pqsignal(SIGTERM, die); /* request shutdown */
3063 : /* SIGQUIT handler was already set up by InitPostmasterChild */
3064 288 : InitializeTimeouts(); /* establishes SIGALRM handler */
3065 288 : pqsignal(SIGPIPE, SIG_IGN); /* ignored */
3066 288 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
3067 288 : pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and
3068 : * shutdown */
3069 :
3070 : /* Reset some signals that are accepted by postmaster but not here */
3071 288 : pqsignal(SIGCHLD, SIG_DFL);
3072 288 : }
3073 :
3074 : /* Report shared-memory space needed by WalSndShmemInit: the WalSndCtlData header up to 'walsnds' plus max_wal_senders WalSnd entries */
3075 : Size
3076 10704 : WalSndShmemSize(void)
3077 : {
3078 10704 : Size size = 0;
3079 :
3080 10704 : size = offsetof(WalSndCtlData, walsnds); /* fixed part preceding the per-walsender array */
3081 10704 : size = add_size(size, mul_size(max_wal_senders, sizeof(WalSnd))); /* presumably overflow-checked arithmetic - confirm */
3082 :
3083 10704 : return size;
3084 : }
3085 :
3086 : /* Allocate and initialize walsender-related shared memory; if the struct was already present ('found'), only the WalSndCtl pointer is set */
3087 : void
3088 3568 : WalSndShmemInit(void)
3089 : {
3090 : bool found;
3091 : int i;
3092 :
3093 3568 : WalSndCtl = (WalSndCtlData *)
3094 3568 : ShmemInitStruct("Wal Sender Ctl", WalSndShmemSize(), &found);
3095 :
3096 3568 : if (!found)
3097 : {
3098 : /* First time through, so initialize: zero everything, then set up the queues and per-slot spinlocks */
3099 3568 : MemSet(WalSndCtl, 0, WalSndShmemSize());
3100 :
3101 14272 : for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; i++)
3102 10704 : SHMQueueInit(&(WalSndCtl->SyncRepQueue[i]));
3103 :
3104 39248 : for (i = 0; i < max_wal_senders; i++)
3105 : {
3106 35680 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3107 :
3108 35680 : SpinLockInit(&walsnd->mutex);
3109 : }
3110 : }
3111 3568 : }
3112 :
3113 : /*
3114 : * Wake up all walsenders, by setting the latch of every slot whose latch pointer is non-NULL.
3115 : *
3116 : * This will be called inside critical sections, so throwing an error is not
3117 : * advisable.
3118 : */
3119 : void
3120 52770 : WalSndWakeup(void)
3121 : {
3122 : int i;
3123 :
3124 580470 : for (i = 0; i < max_wal_senders; i++)
3125 : {
3126 : Latch *latch;
3127 527700 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3128 :
3129 : /*
3130 : * Get latch pointer with spinlock held, for the unlikely case that
3131 : * pointer reads aren't atomic (as they're 8 bytes).
3132 : */
3133 527700 : SpinLockAcquire(&walsnd->mutex);
3134 527700 : latch = walsnd->latch;
3135 527700 : SpinLockRelease(&walsnd->mutex);
3136 :
3137 527700 : if (latch != NULL) /* the latch is set after the spinlock has been released */
3138 1106 : SetLatch(latch);
3139 : }
3140 52770 : }
3141 :
3142 : /*
3143 : * Signal all walsenders to move to stopping state, by sending PROCSIG_WALSND_INIT_STOPPING to every slot with a nonzero pid.
3144 : *
3145 : * This will trigger walsenders to move to a state where no further WAL can be
3146 : * generated. See this file's header for details.
3147 : */
3148 : void
3149 274 : WalSndInitStopping(void)
3150 : {
3151 : int i;
3152 :
3153 3014 : for (i = 0; i < max_wal_senders; i++)
3154 : {
3155 2740 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3156 : pid_t pid;
3157 :
3158 2740 : SpinLockAcquire(&walsnd->mutex);
3159 2740 : pid = walsnd->pid; /* copy under the lock; the signal is sent after releasing it */
3160 2740 : SpinLockRelease(&walsnd->mutex);
3161 :
3162 2740 : if (pid == 0)
3163 2736 : continue;
3164 :
3165 4 : SendProcSignal(pid, PROCSIG_WALSND_INIT_STOPPING, InvalidBackendId);
3166 : }
3167 274 : }
3168 :
3169 : /*
3170 : * Wait until all the WAL senders have quit or reached the stopping state. This
3171 : * is used by the checkpointer to control when the shutdown checkpoint can
3172 : * safely be performed. Polls the shared slots, sleeping 10 msec between scans; there is no timeout.
3173 : */
3174 : void
3175 300 : WalSndWaitStopping(void)
3176 : {
3177 : for (;;)
3178 : {
3179 : int i;
3180 300 : bool all_stopped = true;
3181 :
3182 3040 : for (i = 0; i < max_wal_senders; i++)
3183 : {
3184 2766 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3185 :
3186 2766 : SpinLockAcquire(&walsnd->mutex);
3187 :
3188 2766 : if (walsnd->pid == 0) /* no process in this slot: counts as stopped */
3189 : {
3190 2740 : SpinLockRelease(&walsnd->mutex);
3191 2740 : continue;
3192 : }
3193 :
3194 26 : if (walsnd->state != WALSNDSTATE_STOPPING) /* one running sender is enough to keep waiting */
3195 : {
3196 26 : all_stopped = false;
3197 26 : SpinLockRelease(&walsnd->mutex);
3198 26 : break;
3199 : }
3200 0 : SpinLockRelease(&walsnd->mutex);
3201 : }
3202 :
3203 : /* safe to leave if confirmation is done for all WAL senders */
3204 300 : if (all_stopped)
3205 548 : return;
3206 :
3207 26 : pg_usleep(10000L); /* wait for 10 msec */
3208 26 : }
3209 : }
3210 :
3211 : /* Set state for current walsender (only called in walsender); a no-op when the state is unchanged */
3212 : void
3213 514 : WalSndSetState(WalSndState state)
3214 : {
3215 514 : WalSnd *walsnd = MyWalSnd;
3216 :
3217 514 : Assert(am_walsender);
3218 :
3219 514 : if (walsnd->state == state) /* unlocked read; NOTE(review): presumably safe because only this process writes its own state - confirm */
3220 514 : return;
3221 :
3222 514 : SpinLockAcquire(&walsnd->mutex);
3223 514 : walsnd->state = state;
3224 514 : SpinLockRelease(&walsnd->mutex);
3225 : }
3226 :
3227 : /*
3228 : * Return a string constant representing the state. This is used
3229 : * in system views, and should *not* be translated. Returns "UNKNOWN" for a value matching none of the cases.
3230 : */
3231 : static const char *
3232 514 : WalSndGetStateString(WalSndState state)
3233 : {
3234 514 : switch (state)
3235 : {
3236 : case WALSNDSTATE_STARTUP:
3237 16 : return "startup";
3238 : case WALSNDSTATE_BACKUP:
3239 0 : return "backup";
3240 : case WALSNDSTATE_CATCHUP:
3241 26 : return "catchup";
3242 : case WALSNDSTATE_STREAMING:
3243 472 : return "streaming";
3244 : case WALSNDSTATE_STOPPING:
3245 0 : return "stopping";
3246 : }
3247 0 : return "UNKNOWN"; /* reached only if no case above matched */
3248 : }
3249 :
3250 : static Interval *
3251 452 : offset_to_interval(TimeOffset offset) /* build a palloc'd Interval with month = day = 0 and time = 'offset' */
3252 : {
3253 452 : Interval *result = palloc(sizeof(Interval));
3254 :
3255 452 : result->month = 0;
3256 452 : result->day = 0;
3257 452 : result->time = offset;
3258 :
3259 452 : return result; /* allocated with palloc; not freed here */
3260 : }
3261 :
3262 : /*
3263 : * Returns activity of walsenders, including pids and xlog locations sent to
3264 : * standby servers. One row per slot with a nonzero pid is materialized into a tuplestore; callers lacking pg_read_all_stats membership get only the pid column.
3265 : */
3266 : Datum
3267 464 : pg_stat_get_wal_senders(PG_FUNCTION_ARGS)
3268 : {
3269 : #define PG_STAT_GET_WAL_SENDERS_COLS 12
3270 464 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
3271 : TupleDesc tupdesc;
3272 : Tuplestorestate *tupstore;
3273 : MemoryContext per_query_ctx;
3274 : MemoryContext oldcontext;
3275 : SyncRepStandbyData *sync_standbys;
3276 : int num_standbys;
3277 : int i;
3278 :
3279 : /* check to see if caller supports us returning a tuplestore */
3280 464 : if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
3281 0 : ereport(ERROR,
3282 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3283 : errmsg("set-valued function called in context that cannot accept a set")));
3284 464 : if (!(rsinfo->allowedModes & SFRM_Materialize))
3285 0 : ereport(ERROR,
3286 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
3287 : errmsg("materialize mode required, but it is not allowed in this context")));
3288 :
3289 : /* Build a tuple descriptor for our result type */
3290 464 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
3291 0 : elog(ERROR, "return type must be a row type");
3292 :
3293 464 : per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
3294 464 : oldcontext = MemoryContextSwitchTo(per_query_ctx); /* create the tuplestore in the per-query context */
3295 :
3296 464 : tupstore = tuplestore_begin_heap(true, false, work_mem);
3297 464 : rsinfo->returnMode = SFRM_Materialize;
3298 464 : rsinfo->setResult = tupstore;
3299 464 : rsinfo->setDesc = tupdesc;
3300 :
3301 464 : MemoryContextSwitchTo(oldcontext);
3302 :
3303 : /*
3304 : * Get the currently active synchronous standbys. This could be out of
3305 : * date before we're done, but we'll use the data anyway.
3306 : */
3307 464 : num_standbys = SyncRepGetCandidateStandbys(&sync_standbys);
3308 :
3309 5104 : for (i = 0; i < max_wal_senders; i++)
3310 : {
3311 4640 : WalSnd *walsnd = &WalSndCtl->walsnds[i];
3312 : XLogRecPtr sentPtr;
3313 : XLogRecPtr write;
3314 : XLogRecPtr flush;
3315 : XLogRecPtr apply;
3316 : TimeOffset writeLag;
3317 : TimeOffset flushLag;
3318 : TimeOffset applyLag;
3319 : int priority;
3320 : int pid;
3321 : WalSndState state;
3322 : TimestampTz replyTime;
3323 : bool is_sync_standby;
3324 : Datum values[PG_STAT_GET_WAL_SENDERS_COLS];
3325 : bool nulls[PG_STAT_GET_WAL_SENDERS_COLS];
3326 : int j;
3327 :
3328 : /* Collect data from shared memory: copy the fields to locals under the spinlock, format them afterwards */
3329 4640 : SpinLockAcquire(&walsnd->mutex);
3330 4640 : if (walsnd->pid == 0)
3331 : {
3332 4126 : SpinLockRelease(&walsnd->mutex);
3333 4126 : continue;
3334 : }
3335 514 : pid = walsnd->pid;
3336 514 : sentPtr = walsnd->sentPtr;
3337 514 : state = walsnd->state;
3338 514 : write = walsnd->write;
3339 514 : flush = walsnd->flush;
3340 514 : apply = walsnd->apply;
3341 514 : writeLag = walsnd->writeLag;
3342 514 : flushLag = walsnd->flushLag;
3343 514 : applyLag = walsnd->applyLag;
3344 514 : priority = walsnd->sync_standby_priority;
3345 514 : replyTime = walsnd->replyTime;
3346 514 : SpinLockRelease(&walsnd->mutex);
3347 :
3348 : /*
3349 : * Detect whether walsender is/was considered synchronous. We can
3350 : * provide some protection against stale data by checking the PID
3351 : * along with walsnd_index.
3352 : */
3353 514 : is_sync_standby = false;
3354 514 : for (j = 0; j < num_standbys; j++)
3355 : {
3356 0 : if (sync_standbys[j].walsnd_index == i &&
3357 0 : sync_standbys[j].pid == pid)
3358 : {
3359 0 : is_sync_standby = true;
3360 0 : break;
3361 : }
3362 : }
3363 :
3364 514 : memset(nulls, 0, sizeof(nulls));
3365 514 : values[0] = Int32GetDatum(pid); /* column 0 (pid) is always reported */
3366 :
3367 514 : if (!is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS))
3368 : {
3369 : /*
3370 : * Only superusers and members of pg_read_all_stats can see
3371 : * details. Other users only get the pid value to know it's a
3372 : * walsender, but no details. All columns after the first are marked NULL.
3373 : */
3374 0 : MemSet(&nulls[1], true, PG_STAT_GET_WAL_SENDERS_COLS - 1);
3375 : }
3376 : else
3377 : {
3378 514 : values[1] = CStringGetTextDatum(WalSndGetStateString(state));
3379 :
3380 514 : if (XLogRecPtrIsInvalid(sentPtr))
3381 16 : nulls[2] = true;
3382 514 : values[2] = LSNGetDatum(sentPtr); /* stored even when flagged NULL; presumably ignored then - same pattern for columns 3..5 */
3383 :
3384 514 : if (XLogRecPtrIsInvalid(write))
3385 18 : nulls[3] = true;
3386 514 : values[3] = LSNGetDatum(write);
3387 :
3388 514 : if (XLogRecPtrIsInvalid(flush))
3389 18 : nulls[4] = true;
3390 514 : values[4] = LSNGetDatum(flush);
3391 :
3392 514 : if (XLogRecPtrIsInvalid(apply))
3393 18 : nulls[5] = true;
3394 514 : values[5] = LSNGetDatum(apply);
3395 :
3396 : /*
3397 : * Treat a standby such as a pg_basebackup background process
3398 : * which always returns an invalid flush location, as an
3399 : * asynchronous standby.
3400 : */
3401 514 : priority = XLogRecPtrIsInvalid(flush) ? 0 : priority;
3402 :
3403 514 : if (writeLag < 0) /* a negative lag is reported as NULL (columns 6..8) */
3404 374 : nulls[6] = true;
3405 : else
3406 140 : values[6] = IntervalPGetDatum(offset_to_interval(writeLag));
3407 :
3408 514 : if (flushLag < 0)
3409 342 : nulls[7] = true;
3410 : else
3411 172 : values[7] = IntervalPGetDatum(offset_to_interval(flushLag));
3412 :
3413 514 : if (applyLag < 0)
3414 374 : nulls[8] = true;
3415 : else
3416 140 : values[8] = IntervalPGetDatum(offset_to_interval(applyLag));
3417 :
3418 514 : values[9] = Int32GetDatum(priority);
3419 :
3420 : /*
3421 : * More easily understood version of standby state. This is purely
3422 : * informational.
3423 : *
3424 : * In quorum-based sync replication, the role of each standby
3425 : * listed in synchronous_standby_names can be changing very
3426 : * frequently. Any standbys considered as "sync" at one moment can
3427 : * be switched to "potential" ones at the next moment. So, it's
3428 : * basically useless to report "sync" or "potential" as their sync
3429 : * states. We report just "quorum" for them.
3430 : */
3431 514 : if (priority == 0)
3432 514 : values[10] = CStringGetTextDatum("async");
3433 0 : else if (is_sync_standby)
3434 0 : values[10] = SyncRepConfig->syncrep_method == SYNC_REP_PRIORITY ?
3435 0 : CStringGetTextDatum("sync") : CStringGetTextDatum("quorum");
3436 : else
3437 0 : values[10] = CStringGetTextDatum("potential");
3438 :
3439 514 : if (replyTime == 0) /* a zero reply time is reported as NULL */
3440 18 : nulls[11] = true;
3441 : else
3442 496 : values[11] = TimestampTzGetDatum(replyTime);
3443 : }
3444 :
3445 514 : tuplestore_putvalues(tupstore, tupdesc, values, nulls);
3446 : }
3447 :
3448 : /* clean up and return the tuplestore */
3449 : tuplestore_donestoring(tupstore);
3450 :
3451 464 : return (Datum) 0; /* the rows are delivered through rsinfo->setResult, not the return value */
3452 : }
3453 :
3454 : /*
3455 : * Send a keepalive message to standby.
3456 : *
3457 : * If requestReply is set, the message requests the other party to send
3458 : * a message back to us, for heartbeat purposes. We also set a flag to
3459 : * let nearby code know that we're waiting for that response, to avoid
3460 : * repeated requests.
3461 : */
3462 : static void
3463 4272 : WalSndKeepalive(bool requestReply)
3464 : {
3465 4272 : elog(DEBUG2, "sending replication keepalive");
3466 :
3467 : /* construct the message: 'k', sentPtr, current timestamp, reply-request byte */
3468 4272 : resetStringInfo(&output_message);
3469 4272 : pq_sendbyte(&output_message, 'k');
3470 4272 : pq_sendint64(&output_message, sentPtr);
3471 4272 : pq_sendint64(&output_message, GetCurrentTimestamp());
3472 4272 : pq_sendbyte(&output_message, requestReply ? 1 : 0);
3473 :
3474 : /* ... and send it wrapped in CopyData */
3475 4272 : pq_putmessage_noblock('d', output_message.data, output_message.len);
3476 :
3477 : /* Set local flag (it is never cleared in this function) */
3478 4272 : if (requestReply)
3479 3344 : waiting_for_ping_response = true;
3480 4272 : }
3481 :
3482 : /*
3483 : * Send keepalive message if too much time has elapsed. Does nothing when timeouts are disabled or a reply is already being awaited.
3484 : */
3485 : static void
3486 1183798 : WalSndKeepaliveIfNecessary(void)
3487 : {
3488 : TimestampTz ping_time;
3489 :
3490 : /*
3491 : * Don't send keepalive messages if timeouts are globally disabled or
3492 : * we're doing something not partaking in timeouts.
3493 : */
3494 1183798 : if (wal_sender_timeout <= 0 || last_reply_timestamp <= 0)
3495 6 : return;
3496 :
3497 1183792 : if (waiting_for_ping_response) /* a reply has already been requested: don't ask again */
3498 12444 : return;
3499 :
3500 : /*
3501 : * If half of wal_sender_timeout has lapsed without receiving any reply
3502 : * from the standby, send a keep-alive message to the standby requesting
3503 : * an immediate reply. The comparison uses last_processing rather than a fresh clock reading.
3504 : */
3505 1171348 : ping_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
3506 : wal_sender_timeout / 2);
3507 1171348 : if (last_processing >= ping_time)
3508 : {
3509 0 : WalSndKeepalive(true);
3510 :
3511 : /* Try to flush pending output to the client; a nonzero result leads to WalSndShutdown() */
3512 0 : if (pq_flush_if_writable() != 0)
3513 0 : WalSndShutdown();
3514 : }
3515 : }
3516 :
3517 : /*
3518 : * Record the end of the WAL and the time it was flushed locally, so that
3519 : * LagTrackerRead can compute the elapsed time (lag) when this WAL location is
3520 : * eventually reported to have been written, flushed and applied by the
3521 : * standby in a reply message. 'lsn' is the WAL position to sample and 'local_flush_time' the timestamp stored with it; a no-op outside a walsender or when 'lsn' repeats.
3522 : */
3523 : static void
3524 76 : LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time)
3525 : {
3526 : bool buffer_full;
3527 : int new_write_head;
3528 : int i;
3529 :
3530 76 : if (!am_walsender)
3531 0 : return;
3532 :
3533 : /*
3534 : * If the lsn hasn't advanced since last time, then do nothing. This way
3535 : * we only record a new sample when new WAL has been written.
3536 : */
3537 76 : if (lag_tracker->last_lsn == lsn)
3538 0 : return;
3539 76 : lag_tracker->last_lsn = lsn;
3540 :
3541 : /*
3542 : * If advancing the write head of the circular buffer would crash into any
3543 : * of the read heads, then the buffer is full. In other words, the
3544 : * slowest reader (presumably apply) is the one that controls the release
3545 : * of space.
3546 : */
3547 76 : new_write_head = (lag_tracker->write_head + 1) % LAG_TRACKER_BUFFER_SIZE;
3548 76 : buffer_full = false;
3549 304 : for (i = 0; i < NUM_SYNC_REP_WAIT_MODE; ++i)
3550 : {
3551 228 : if (new_write_head == lag_tracker->read_heads[i])
3552 0 : buffer_full = true;
3553 : }
3554 :
3555 : /*
3556 : * If the buffer is full, for now we just rewind by one slot and overwrite
3557 : * the last sample, as a simple (if somewhat uneven) way to lower the
3558 : * sampling rate. There may be better adaptive compaction algorithms.
3559 : */
3560 76 : if (buffer_full)
3561 : {
3562 0 : new_write_head = lag_tracker->write_head; /* the head ends up where it was before this call */
3563 0 : if (lag_tracker->write_head > 0)
3564 0 : lag_tracker->write_head--;
3565 : else
3566 0 : lag_tracker->write_head = LAG_TRACKER_BUFFER_SIZE - 1; /* wrap around to the last slot */
3567 : }
3568 :
3569 : /* Store a sample at the current write head position, then advance the head. */
3570 76 : lag_tracker->buffer[lag_tracker->write_head].lsn = lsn;
3571 76 : lag_tracker->buffer[lag_tracker->write_head].time = local_flush_time;
3572 76 : lag_tracker->write_head = new_write_head;
3573 : }
3574 :
3575 : /*
3576 : * Find out how much time has elapsed between the moment WAL location 'lsn'
3577 : * (or the highest known earlier LSN) was flushed locally and the time 'now'.
3578 : * We have a separate read head for each of the reported LSN locations we
3579 : * receive in replies from standby; 'head' controls which read head is
3580 : * used. Whenever a read head crosses an LSN which was written into the
3581 : * lag buffer with LagTrackerWrite, we can use the associated timestamp to
3582 : * find out the time this LSN (or an earlier one) was flushed locally, and
3583 : * therefore compute the lag.
3584 : *
3585 : * Return -1 if no new sample data is available, and otherwise the elapsed
3586 : * time in microseconds. As a side effect, the selected read head and its last_read sample are advanced.
3587 : */
3588 : static TimeOffset
3589 150402 : LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now)
3590 : {
3591 150402 : TimestampTz time = 0; /* 0 means "no sample crossed yet" */
3592 :
3593 : /* Read all unread samples up to this LSN or end of buffer. */
3594 305774 : while (lag_tracker->read_heads[head] != lag_tracker->write_head &&
3595 4754 : lag_tracker->buffer[lag_tracker->read_heads[head]].lsn <= lsn)
3596 : {
3597 216 : time = lag_tracker->buffer[lag_tracker->read_heads[head]].time;
3598 432 : lag_tracker->last_read[head] =
3599 216 : lag_tracker->buffer[lag_tracker->read_heads[head]];
3600 432 : lag_tracker->read_heads[head] =
3601 216 : (lag_tracker->read_heads[head] + 1) % LAG_TRACKER_BUFFER_SIZE;
3602 : }
3603 :
3604 : /*
3605 : * If the lag tracker is empty, that means the standby has processed
3606 : * everything we've ever sent so we should now clear 'last_read'. If we
3607 : * didn't do that, we'd risk using a stale and irrelevant sample for
3608 : * interpolation at the beginning of the next burst of WAL after a period
3609 : * of idleness.
3610 : */
3611 150402 : if (lag_tracker->read_heads[head] == lag_tracker->write_head)
3612 145864 : lag_tracker->last_read[head].time = 0;
3613 :
3614 150402 : if (time > now)
3615 : {
3616 : /* If the clock somehow went backwards, treat as not found. */
3617 0 : return -1;
3618 : }
3619 150402 : else if (time == 0)
3620 : {
3621 : /*
3622 : * We didn't cross a time. If there is a future sample that we
3623 : * haven't reached yet, and we've already reached at least one sample,
3624 : * let's interpolate the local flushed time. This is mainly useful
3625 : * for reporting a completely stuck apply position as having
3626 : * increasing lag, since otherwise we'd have to wait for it to
3627 : * eventually start moving again and cross one of our samples before
3628 : * we can show the lag increasing.
3629 : */
3630 150186 : if (lag_tracker->read_heads[head] == lag_tracker->write_head)
3631 : {
3632 : /* There are no future samples, so we can't interpolate. */
3633 145648 : return -1;
3634 : }
3635 4538 : else if (lag_tracker->last_read[head].time != 0)
3636 : {
3637 : /* We can interpolate between last_read and the next sample. */
3638 : double fraction;
3639 0 : WalTimeSample prev = lag_tracker->last_read[head];
3640 0 : WalTimeSample next = lag_tracker->buffer[lag_tracker->read_heads[head]];
3641 :
3642 0 : if (lsn < prev.lsn)
3643 : {
3644 : /*
3645 : * Reported LSNs shouldn't normally go backwards, but it's
3646 : * possible when there is a timeline change. Treat as not
3647 : * found.
3648 : */
3649 0 : return -1;
3650 : }
3651 :
3652 0 : Assert(prev.lsn < next.lsn); /* also keeps the divisor below nonzero */
3653 :
3654 0 : if (prev.time > next.time)
3655 : {
3656 : /* If the clock somehow went backwards, treat as not found. */
3657 0 : return -1;
3658 : }
3659 :
3660 : /* See how far we are between the previous and next samples. */
3661 0 : fraction =
3662 0 : (double) (lsn - prev.lsn) / (double) (next.lsn - prev.lsn);
3663 :
3664 : /* Scale the local flush time proportionally. */
3665 0 : time = (TimestampTz)
3666 0 : ((double) prev.time + (next.time - prev.time) * fraction);
3667 : }
3668 : else
3669 : {
3670 : /*
3671 : * We have only a future sample, implying that we were entirely
3672 : * caught up and now there is a new burst of WAL and the
3673 : * standby hasn't processed the first sample yet. Until the
3674 : * standby reaches the future sample the best we can do is report
3675 : * the hypothetical lag if that sample were to be replayed now.
3676 : */
3677 4538 : time = lag_tracker->buffer[lag_tracker->read_heads[head]].time;
3678 : }
3679 : }
3680 :
3681 : /* Return the elapsed time since local flush time in microseconds. */
3682 4754 : Assert(time != 0);
3683 4754 : return now - time;
3684 : }
|