Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * tablesync.c
3 : * PostgreSQL logical replication: initial table data synchronization
4 : *
5 : * Copyright (c) 2012-2020, PostgreSQL Global Development Group
6 : *
7 : * IDENTIFICATION
8 : * src/backend/replication/logical/tablesync.c
9 : *
10 : * NOTES
11 : * This file contains code for initial table data synchronization for
12 : * logical replication.
13 : *
14 : * The initial data synchronization is done separately for each table,
15 : * in a separate apply worker that only fetches the initial snapshot data
16 : * from the publisher and then synchronizes the position in the stream with
17 : * the main apply worker.
18 : *
19 : * There are several reasons for doing the synchronization this way:
20 : * - It allows us to parallelize the initial data synchronization
21 : * which lowers the time needed for it to happen.
22 : * - The initial synchronization does not have to hold the xid and LSN
23 : * for the time it takes to copy data of all tables, causing less
24 : * bloat and lower disk consumption compared to doing the
25 : * synchronization in a single process for the whole database.
26 : * - It allows us to synchronize any tables added after the initial
27 : * synchronization has finished.
28 : *
29 : * The stream position synchronization works in multiple steps:
30 : * - Apply worker requests a tablesync worker to start, setting the new
31 : * table state to INIT.
32 : * - Tablesync worker starts; changes table state from INIT to DATASYNC while
33 : * copying.
34 : * - Tablesync worker finishes the copy and sets table state to SYNCWAIT;
35 : * waits for state change.
36 : * - Apply worker periodically checks for tables in SYNCWAIT state. When
37 : * any appear, it sets the table state to CATCHUP and starts loop-waiting
38 : * until either the table state is set to SYNCDONE or the sync worker
39 : * exits.
40 : * - After the sync worker has seen the state change to CATCHUP, it will
41 : * read the stream and apply changes (acting like an apply worker) until
42 : * it catches up to the specified stream position. Then it sets the
43 : * state to SYNCDONE. There might be zero changes applied between
44 : * CATCHUP and SYNCDONE, because the sync worker might be ahead of the
45 : * apply worker.
46 : * - Once the state is set to SYNCDONE, the apply will continue tracking
47 : * the table until it reaches the SYNCDONE stream position, at which
48 : * point it sets state to READY and stops tracking. Again, there might
49 : * be zero changes in between.
50 : *
51 : * So the state progression is always: INIT -> DATASYNC -> SYNCWAIT ->
52 : * CATCHUP -> SYNCDONE -> READY.
53 : *
54 : * The catalog pg_subscription_rel is used to keep information about
55 : * subscribed tables and their state. Some transient state during data
56 : * synchronization is kept in shared memory. The states SYNCWAIT and
57 : * CATCHUP only appear in memory.
58 : *
59 : * Example flows look like this:
60 : * - Apply is in front:
61 : * sync:8
62 : * -> set in memory SYNCWAIT
63 : * apply:10
64 : * -> set in memory CATCHUP
65 : * -> enter wait-loop
66 : * sync:10
67 : * -> set in catalog SYNCDONE
68 : * -> exit
69 : * apply:10
70 : * -> exit wait-loop
71 : * -> continue rep
72 : * apply:11
73 : * -> set in catalog READY
74 : *
75 : * - Sync is in front:
76 : * sync:10
77 : * -> set in memory SYNCWAIT
78 : * apply:8
79 : * -> set in memory CATCHUP
80 : * -> continue per-table filtering
81 : * sync:10
82 : * -> set in catalog SYNCDONE
83 : * -> exit
84 : * apply:10
85 : * -> set in catalog READY
86 : * -> stop per-table filtering
87 : * -> continue rep
88 : *-------------------------------------------------------------------------
89 : */
90 :
91 : #include "postgres.h"
92 :
93 : #include "access/table.h"
94 : #include "access/xact.h"
95 : #include "catalog/pg_subscription_rel.h"
96 : #include "catalog/pg_type.h"
97 : #include "commands/copy.h"
98 : #include "miscadmin.h"
99 : #include "parser/parse_relation.h"
100 : #include "pgstat.h"
101 : #include "replication/logicallauncher.h"
102 : #include "replication/logicalrelation.h"
103 : #include "replication/walreceiver.h"
104 : #include "replication/worker_internal.h"
105 : #include "storage/ipc.h"
106 : #include "utils/builtins.h"
107 : #include "utils/lsyscache.h"
108 : #include "utils/memutils.h"
109 : #include "utils/snapmgr.h"
110 :
111 : static bool table_states_valid = false;
112 :
113 : StringInfo copybuf = NULL;
114 :
115 : /*
116 : * Exit routine for synchronization worker.
117 : */
118 : static void
119 : pg_attribute_noreturn()
120 122 : finish_sync_worker(void)
121 : {
122 : /*
123 : * Commit any outstanding transaction. This is the usual case, unless
124 : * there was nothing to do for the table.
125 : */
126 122 : if (IsTransactionState())
127 : {
128 122 : CommitTransactionCommand();
129 122 : pgstat_report_stat(false);
130 : }
131 :
132 : /* And flush all writes. */
133 122 : XLogFlush(GetXLogWriteRecPtr());
134 :
135 122 : StartTransactionCommand();
136 122 : ereport(LOG,
137 : (errmsg("logical replication table synchronization worker for subscription \"%s\", table \"%s\" has finished",
138 : MySubscription->name,
139 : get_rel_name(MyLogicalRepWorker->relid))));
140 122 : CommitTransactionCommand();
141 :
142 : /* Find the main apply worker and signal it. */
143 122 : logicalrep_worker_wakeup(MyLogicalRepWorker->subid, InvalidOid);
144 :
145 : /* Stop gracefully */
146 122 : proc_exit(0);
147 : }
148 :
149 : /*
150 : * Wait until the relation sync state in the catalog changes to the expected
151 : * one; returns true when that happens.
152 : *
153 : * Returns false if the table sync worker or the table itself has
154 : * disappeared, or if the table state has been reset.
155 : *
156 : * Currently, this is used in the apply worker when transitioning from
157 : * CATCHUP state to SYNCDONE.
158 : */
159 : static bool
160 238 : wait_for_relation_state_change(Oid relid, char expected_state)
161 : {
162 : char state;
163 :
164 : for (;;)
165 : {
166 : LogicalRepWorker *worker;
167 : XLogRecPtr statelsn;
168 :
169 238 : CHECK_FOR_INTERRUPTS();
170 :
171 238 : InvalidateCatalogSnapshot();
172 238 : state = GetSubscriptionRelState(MyLogicalRepWorker->subid,
173 : relid, &statelsn);
174 :
175 238 : if (state == SUBREL_STATE_UNKNOWN)
176 112 : break;
177 :
178 238 : if (state == expected_state)
179 0 : return true;
180 :
181 : /* Check if the sync worker is still running and bail if not. */
182 238 : LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
183 238 : worker = logicalrep_worker_find(MyLogicalRepWorker->subid, relid,
184 : false);
185 238 : LWLockRelease(LogicalRepWorkerLock);
186 238 : if (!worker)
187 112 : break;
188 :
189 126 : (void) WaitLatch(MyLatch,
190 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
191 : 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
192 :
193 126 : ResetLatch(MyLatch);
194 126 : }
195 :
196 112 : return false;
197 : }
198 :
199 : /*
200 : * Wait until the apply worker changes the state of our synchronization
201 : * worker to the expected one.
202 : *
203 : * Used when transitioning from SYNCWAIT state to CATCHUP.
204 : *
205 : * Returns false if the apply worker has disappeared.
206 : */
207 : static bool
208 244 : wait_for_worker_state_change(char expected_state)
209 : {
210 : int rc;
211 :
212 : for (;;)
213 : {
214 : LogicalRepWorker *worker;
215 :
216 244 : CHECK_FOR_INTERRUPTS();
217 :
218 : /*
219 : * Done if already in correct state. (We assume this fetch is atomic
220 : * enough to not give a misleading answer if we do it with no lock.)
221 : */
222 244 : if (MyLogicalRepWorker->relstate == expected_state)
223 122 : return true;
224 :
225 : /*
226 : * Bail out if the apply worker has died, else signal it we're
227 : * waiting.
228 : */
229 122 : LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
230 122 : worker = logicalrep_worker_find(MyLogicalRepWorker->subid,
231 : InvalidOid, false);
232 122 : if (worker && worker->proc)
233 122 : logicalrep_worker_wakeup_ptr(worker);
234 122 : LWLockRelease(LogicalRepWorkerLock);
235 122 : if (!worker)
236 0 : break;
237 :
238 : /*
239 : * Wait. We expect to get a latch signal back from the apply worker,
240 : * but use a timeout in case it dies without sending one.
241 : */
242 122 : rc = WaitLatch(MyLatch,
243 : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
244 : 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
245 :
246 122 : if (rc & WL_LATCH_SET)
247 122 : ResetLatch(MyLatch);
248 122 : }
249 :
250 0 : return false;
251 : }
252 :
253 : /*
254 : * Callback from syscache invalidation.
255 : */
256 : void
257 562 : invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue)
258 : {
259 562 : table_states_valid = false;
260 562 : }
261 :
262 : /*
263 : * Handle table synchronization cooperation from the synchronization
264 : * worker.
265 : *
266 : * If the sync worker is in CATCHUP state and reached (or passed) the
267 : * predetermined synchronization point in the WAL stream, mark the table as
268 : * SYNCDONE and finish.
269 : */
270 : static void
271 130 : process_syncing_tables_for_sync(XLogRecPtr current_lsn)
272 : {
273 130 : Assert(IsTransactionState());
274 :
275 130 : SpinLockAcquire(&MyLogicalRepWorker->relmutex);
276 :
277 260 : if (MyLogicalRepWorker->relstate == SUBREL_STATE_CATCHUP &&
278 130 : current_lsn >= MyLogicalRepWorker->relstate_lsn)
279 : {
280 : TimeLineID tli;
281 :
282 122 : MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCDONE;
283 122 : MyLogicalRepWorker->relstate_lsn = current_lsn;
284 :
285 122 : SpinLockRelease(&MyLogicalRepWorker->relmutex);
286 :
287 366 : UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
288 122 : MyLogicalRepWorker->relid,
289 122 : MyLogicalRepWorker->relstate,
290 122 : MyLogicalRepWorker->relstate_lsn);
291 :
292 122 : walrcv_endstreaming(wrconn, &tli);
293 122 : finish_sync_worker();
294 : }
295 : else
296 8 : SpinLockRelease(&MyLogicalRepWorker->relmutex);
297 8 : }
298 :
299 : /*
300 : * Handle table synchronization cooperation from the apply worker.
301 : *
302 : * Walk over all subscription tables that are individually tracked by the
303 : * apply process (currently, all that have state other than
304 : * SUBREL_STATE_READY) and manage synchronization for them.
305 : *
306 : * If there are tables that need synchronizing and are not being synchronized
307 : * yet, start sync workers for them (if there are free slots for sync
308 : * workers). To prevent starting the sync worker for the same relation at a
309 : * high frequency after a failure, we store its last start time with each sync
310 : * state info. We start the sync worker for the same relation after waiting
311 : * at least wal_retrieve_retry_interval.
312 : *
313 : * For tables that are being synchronized already, check if sync workers
314 : * either need action from the apply worker or have finished. This is the
315 : * SYNCWAIT to CATCHUP transition.
316 : *
317 : * If the synchronization position is reached (SYNCDONE), then the table can
318 : * be marked as READY and is no longer tracked.
319 : */
320 : static void
321 5448 : process_syncing_tables_for_apply(XLogRecPtr current_lsn)
322 : {
323 : struct tablesync_start_time_mapping
324 : {
325 : Oid relid;
326 : TimestampTz last_start_time;
327 : };
328 : static List *table_states = NIL;
329 : static HTAB *last_start_times = NULL;
330 : ListCell *lc;
331 5448 : bool started_tx = false;
332 :
333 5448 : Assert(!IsTransactionState());
334 :
335 : /* We need up-to-date sync state info for subscription tables here. */
336 5448 : if (!table_states_valid)
337 : {
338 : MemoryContext oldctx;
339 : List *rstates;
340 : ListCell *lc;
341 : SubscriptionRelState *rstate;
342 :
343 : /* Clean the old list. */
344 366 : list_free_deep(table_states);
345 366 : table_states = NIL;
346 :
347 366 : StartTransactionCommand();
348 366 : started_tx = true;
349 :
350 : /* Fetch all non-ready tables. */
351 366 : rstates = GetSubscriptionNotReadyRelations(MySubscription->oid);
352 :
353 : /* Allocate the tracking info in a permanent memory context. */
354 366 : oldctx = MemoryContextSwitchTo(CacheMemoryContext);
355 1180 : foreach(lc, rstates)
356 : {
357 814 : rstate = palloc(sizeof(SubscriptionRelState));
358 814 : memcpy(rstate, lfirst(lc), sizeof(SubscriptionRelState));
359 814 : table_states = lappend(table_states, rstate);
360 : }
361 366 : MemoryContextSwitchTo(oldctx);
362 :
363 366 : table_states_valid = true;
364 : }
365 :
366 : /*
367 : * Prepare a hash table for tracking last start times of workers, to avoid
368 : * immediate restarts. We don't need it if there are no tables that need
369 : * syncing.
370 : */
371 5448 : if (table_states && !last_start_times)
372 52 : {
373 : HASHCTL ctl;
374 :
375 52 : memset(&ctl, 0, sizeof(ctl));
376 52 : ctl.keysize = sizeof(Oid);
377 52 : ctl.entrysize = sizeof(struct tablesync_start_time_mapping);
378 52 : last_start_times = hash_create("Logical replication table sync worker start times",
379 : 256, &ctl, HASH_ELEM | HASH_BLOBS);
380 : }
381 :
382 : /*
383 : * Clean up the hash table when we're done with all tables (just to
384 : * release the bit of memory).
385 : */
386 5396 : else if (!table_states && last_start_times)
387 : {
388 48 : hash_destroy(last_start_times);
389 48 : last_start_times = NULL;
390 : }
391 :
392 : /*
393 : * Process all tables that are being synchronized.
394 : */
395 6474 : foreach(lc, table_states)
396 : {
397 1026 : SubscriptionRelState *rstate = (SubscriptionRelState *) lfirst(lc);
398 :
399 1026 : if (rstate->state == SUBREL_STATE_SYNCDONE)
400 : {
401 : /*
402 : * Apply has caught up to the position where the table sync has
403 : * finished. Mark the table as ready so that the apply will just
404 : * continue to replicate it normally.
405 : */
406 110 : if (current_lsn >= rstate->lsn)
407 : {
408 110 : rstate->state = SUBREL_STATE_READY;
409 110 : rstate->lsn = current_lsn;
410 110 : if (!started_tx)
411 : {
412 0 : StartTransactionCommand();
413 0 : started_tx = true;
414 : }
415 :
416 220 : UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
417 110 : rstate->relid, rstate->state,
418 : rstate->lsn);
419 : }
420 : }
421 : else
422 : {
423 : LogicalRepWorker *syncworker;
424 :
425 : /*
426 : * Look for a sync worker for this relation.
427 : */
428 916 : LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
429 :
430 916 : syncworker = logicalrep_worker_find(MyLogicalRepWorker->subid,
431 : rstate->relid, false);
432 :
433 916 : if (syncworker)
434 : {
435 : /* Found one, update our copy of its state */
436 268 : SpinLockAcquire(&syncworker->relmutex);
437 268 : rstate->state = syncworker->relstate;
438 268 : rstate->lsn = syncworker->relstate_lsn;
439 268 : if (rstate->state == SUBREL_STATE_SYNCWAIT)
440 : {
441 : /*
442 : * Sync worker is waiting for apply. Tell the sync worker it
443 : * can catch up now.
444 : */
445 112 : syncworker->relstate = SUBREL_STATE_CATCHUP;
446 112 : syncworker->relstate_lsn =
447 112 : Max(syncworker->relstate_lsn, current_lsn);
448 : }
449 268 : SpinLockRelease(&syncworker->relmutex);
450 :
451 : /* If we told worker to catch up, wait for it. */
452 268 : if (rstate->state == SUBREL_STATE_SYNCWAIT)
453 : {
454 : /* Signal the sync worker, as it may be waiting for us. */
455 112 : if (syncworker->proc)
456 112 : logicalrep_worker_wakeup_ptr(syncworker);
457 :
458 : /* Now safe to release the LWLock */
459 112 : LWLockRelease(LogicalRepWorkerLock);
460 :
461 : /*
462 : * Enter busy loop and wait for synchronization worker to
463 : * reach expected state (or die trying).
464 : */
465 112 : if (!started_tx)
466 : {
467 54 : StartTransactionCommand();
468 54 : started_tx = true;
469 : }
470 :
471 112 : wait_for_relation_state_change(rstate->relid,
472 : SUBREL_STATE_SYNCDONE);
473 : }
474 : else
475 156 : LWLockRelease(LogicalRepWorkerLock);
476 : }
477 : else
478 : {
479 : /*
480 : * If there is no sync worker for this table yet, count
481 : * running sync workers for this subscription, while we have
482 : * the lock.
483 : */
484 648 : int nsyncworkers =
485 648 : logicalrep_sync_worker_count(MyLogicalRepWorker->subid);
486 :
487 : /* Now safe to release the LWLock */
488 648 : LWLockRelease(LogicalRepWorkerLock);
489 :
490 : /*
491 : * If there are free sync worker slot(s), start a new sync
492 : * worker for the table.
493 : */
494 648 : if (nsyncworkers < max_sync_workers_per_subscription)
495 : {
496 132 : TimestampTz now = GetCurrentTimestamp();
497 : struct tablesync_start_time_mapping *hentry;
498 : bool found;
499 :
500 132 : hentry = hash_search(last_start_times, &rstate->relid,
501 : HASH_ENTER, &found);
502 :
503 152 : if (!found ||
504 20 : TimestampDifferenceExceeds(hentry->last_start_time, now,
505 : wal_retrieve_retry_interval))
506 : {
507 366 : logicalrep_worker_launch(MyLogicalRepWorker->dbid,
508 122 : MySubscription->oid,
509 122 : MySubscription->name,
510 122 : MyLogicalRepWorker->userid,
511 : rstate->relid);
512 122 : hentry->last_start_time = now;
513 : }
514 : }
515 : }
516 : }
517 : }
518 :
519 5448 : if (started_tx)
520 : {
521 420 : CommitTransactionCommand();
522 420 : pgstat_report_stat(false);
523 : }
524 5448 : }
525 :
526 : /*
527 : * Process possible state change(s) of tables that are being synchronized.
528 : */
529 : void
530 5578 : process_syncing_tables(XLogRecPtr current_lsn)
531 : {
532 5578 : if (am_tablesync_worker())
533 130 : process_syncing_tables_for_sync(current_lsn);
534 : else
535 5448 : process_syncing_tables_for_apply(current_lsn);
536 5456 : }
537 :
538 : /*
539 : * Create list of columns for COPY based on logical relation mapping.
540 : */
541 : static List *
542 126 : make_copy_attnamelist(LogicalRepRelMapEntry *rel)
543 : {
544 126 : List *attnamelist = NIL;
545 : int i;
546 :
547 350 : for (i = 0; i < rel->remoterel.natts; i++)
548 : {
549 224 : attnamelist = lappend(attnamelist,
550 224 : makeString(rel->remoterel.attnames[i]));
551 : }
552 :
553 :
554 126 : return attnamelist;
555 : }
556 :
557 : /*
558 : * Data source callback for the COPY FROM, which reads from the remote
559 : * connection and passes the data back to our local COPY.
560 : */
561 : static int
562 26418 : copy_read_data(void *outbuf, int minread, int maxread)
563 : {
564 26418 : int bytesread = 0;
565 : int avail;
566 :
567 : /* If there are some leftover data from previous read, use it. */
568 26418 : avail = copybuf->len - copybuf->cursor;
569 26418 : if (avail)
570 : {
571 0 : if (avail > maxread)
572 0 : avail = maxread;
573 0 : memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
574 0 : copybuf->cursor += avail;
575 0 : maxread -= avail;
576 0 : bytesread += avail;
577 : }
578 :
579 52836 : while (maxread > 0 && bytesread < minread)
580 : {
581 26418 : pgsocket fd = PGINVALID_SOCKET;
582 : int len;
583 26418 : char *buf = NULL;
584 :
585 : for (;;)
586 : {
587 : /* Try read the data. */
588 26418 : len = walrcv_receive(wrconn, &buf, &fd);
589 :
590 26418 : CHECK_FOR_INTERRUPTS();
591 :
592 26418 : if (len == 0)
593 0 : break;
594 26418 : else if (len < 0)
595 26544 : return bytesread;
596 : else
597 : {
598 : /* Process the data */
599 26292 : copybuf->data = buf;
600 26292 : copybuf->len = len;
601 26292 : copybuf->cursor = 0;
602 :
603 26292 : avail = copybuf->len - copybuf->cursor;
604 26292 : if (avail > maxread)
605 0 : avail = maxread;
606 26292 : memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
607 26292 : outbuf = (void *) ((char *) outbuf + avail);
608 26292 : copybuf->cursor += avail;
609 26292 : maxread -= avail;
610 26292 : bytesread += avail;
611 : }
612 :
613 26292 : if (maxread <= 0 || bytesread >= minread)
614 26292 : return bytesread;
615 0 : }
616 :
617 : /*
618 : * Wait for more data or latch.
619 : */
620 0 : (void) WaitLatchOrSocket(MyLatch,
621 : WL_SOCKET_READABLE | WL_LATCH_SET |
622 : WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
623 : fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
624 :
625 0 : ResetLatch(MyLatch);
626 : }
627 :
628 0 : return bytesread;
629 : }
630 :
631 :
632 : /*
633 : * Get information about a remote relation, in a fashion similar to what the
634 : * RELATION message provides during replication.
635 : */
636 : static void
637 126 : fetch_remote_table_info(char *nspname, char *relname,
638 : LogicalRepRelation *lrel)
639 : {
640 : WalRcvExecResult *res;
641 : StringInfoData cmd;
642 : TupleTableSlot *slot;
643 126 : Oid tableRow[] = {OIDOID, CHAROID, CHAROID};
644 126 : Oid attrRow[] = {TEXTOID, OIDOID, INT4OID, BOOLOID};
645 : bool isnull;
646 : int natt;
647 :
648 126 : lrel->nspname = nspname;
649 126 : lrel->relname = relname;
650 :
651 : /* First fetch Oid and replica identity. */
652 126 : initStringInfo(&cmd);
653 126 : appendStringInfo(&cmd, "SELECT c.oid, c.relreplident, c.relkind"
654 : " FROM pg_catalog.pg_class c"
655 : " INNER JOIN pg_catalog.pg_namespace n"
656 : " ON (c.relnamespace = n.oid)"
657 : " WHERE n.nspname = %s"
658 : " AND c.relname = %s",
659 : quote_literal_cstr(nspname),
660 : quote_literal_cstr(relname));
661 126 : res = walrcv_exec(wrconn, cmd.data, lengthof(tableRow), tableRow);
662 :
663 126 : if (res->status != WALRCV_OK_TUPLES)
664 0 : ereport(ERROR,
665 : (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s",
666 : nspname, relname, res->err)));
667 :
668 126 : slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
669 126 : if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot))
670 0 : ereport(ERROR,
671 : (errmsg("table \"%s.%s\" not found on publisher",
672 : nspname, relname)));
673 :
674 126 : lrel->remoteid = DatumGetObjectId(slot_getattr(slot, 1, &isnull));
675 126 : Assert(!isnull);
676 126 : lrel->replident = DatumGetChar(slot_getattr(slot, 2, &isnull));
677 126 : Assert(!isnull);
678 126 : lrel->relkind = DatumGetChar(slot_getattr(slot, 3, &isnull));
679 126 : Assert(!isnull);
680 :
681 126 : ExecDropSingleTupleTableSlot(slot);
682 126 : walrcv_clear_result(res);
683 :
684 : /* Now fetch columns. */
685 126 : resetStringInfo(&cmd);
686 252 : appendStringInfo(&cmd,
687 : "SELECT a.attname,"
688 : " a.atttypid,"
689 : " a.atttypmod,"
690 : " a.attnum = ANY(i.indkey)"
691 : " FROM pg_catalog.pg_attribute a"
692 : " LEFT JOIN pg_catalog.pg_index i"
693 : " ON (i.indexrelid = pg_get_replica_identity_index(%u))"
694 : " WHERE a.attnum > 0::pg_catalog.int2"
695 : " AND NOT a.attisdropped %s"
696 : " AND a.attrelid = %u"
697 : " ORDER BY a.attnum",
698 : lrel->remoteid,
699 126 : (walrcv_server_version(wrconn) >= 120000 ? "AND a.attgenerated = ''" : ""),
700 : lrel->remoteid);
701 126 : res = walrcv_exec(wrconn, cmd.data, lengthof(attrRow), attrRow);
702 :
703 126 : if (res->status != WALRCV_OK_TUPLES)
704 0 : ereport(ERROR,
705 : (errmsg("could not fetch table info for table \"%s.%s\": %s",
706 : nspname, relname, res->err)));
707 :
708 : /* We don't know the number of rows coming, so allocate enough space. */
709 126 : lrel->attnames = palloc0(MaxTupleAttributeNumber * sizeof(char *));
710 126 : lrel->atttyps = palloc0(MaxTupleAttributeNumber * sizeof(Oid));
711 126 : lrel->attkeys = NULL;
712 :
713 126 : natt = 0;
714 126 : slot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
715 476 : while (tuplestore_gettupleslot(res->tuplestore, true, false, slot))
716 : {
717 448 : lrel->attnames[natt] =
718 224 : TextDatumGetCString(slot_getattr(slot, 1, &isnull));
719 224 : Assert(!isnull);
720 224 : lrel->atttyps[natt] = DatumGetObjectId(slot_getattr(slot, 2, &isnull));
721 224 : Assert(!isnull);
722 224 : if (DatumGetBool(slot_getattr(slot, 4, &isnull)))
723 100 : lrel->attkeys = bms_add_member(lrel->attkeys, natt);
724 :
725 : /* Should never happen. */
726 224 : if (++natt >= MaxTupleAttributeNumber)
727 0 : elog(ERROR, "too many columns in remote table \"%s.%s\"",
728 : nspname, relname);
729 :
730 224 : ExecClearTuple(slot);
731 : }
732 126 : ExecDropSingleTupleTableSlot(slot);
733 :
734 126 : lrel->natts = natt;
735 :
736 126 : walrcv_clear_result(res);
737 126 : pfree(cmd.data);
738 126 : }
739 :
740 : /*
741 : * Copy existing data of a table from publisher.
742 : *
743 : * Caller is responsible for locking the local relation.
744 : */
745 : static void
746 126 : copy_table(Relation rel)
747 : {
748 : LogicalRepRelMapEntry *relmapentry;
749 : LogicalRepRelation lrel;
750 : WalRcvExecResult *res;
751 : StringInfoData cmd;
752 : CopyState cstate;
753 : List *attnamelist;
754 : ParseState *pstate;
755 :
756 : /* Get the publisher relation info. */
757 126 : fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)),
758 126 : RelationGetRelationName(rel), &lrel);
759 :
760 : /* Put the relation into relmap. */
761 126 : logicalrep_relmap_update(&lrel);
762 :
763 : /* Map the publisher relation to local one. */
764 126 : relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock);
765 126 : Assert(rel == relmapentry->localrel);
766 :
767 : /* Start copy on the publisher. */
768 126 : initStringInfo(&cmd);
769 126 : if (lrel.relkind == RELKIND_RELATION)
770 118 : appendStringInfo(&cmd, "COPY %s TO STDOUT",
771 118 : quote_qualified_identifier(lrel.nspname, lrel.relname));
772 : else
773 : {
774 : /*
775 : * For non-tables, we need to do COPY (SELECT ...), but we can't just
776 : * do SELECT * because we must not copy generated columns.
777 : */
778 8 : appendStringInfoString(&cmd, "COPY (SELECT ");
779 24 : for (int i = 0; i < lrel.natts; i++)
780 : {
781 16 : appendStringInfoString(&cmd, quote_identifier(lrel.attnames[i]));
782 16 : if (i < lrel.natts - 1)
783 8 : appendStringInfoString(&cmd, ", ");
784 : }
785 8 : appendStringInfo(&cmd, " FROM %s) TO STDOUT",
786 8 : quote_qualified_identifier(lrel.nspname, lrel.relname));
787 : }
788 126 : res = walrcv_exec(wrconn, cmd.data, 0, NULL);
789 126 : pfree(cmd.data);
790 126 : if (res->status != WALRCV_OK_COPY_OUT)
791 0 : ereport(ERROR,
792 : (errmsg("could not start initial contents copy for table \"%s.%s\": %s",
793 : lrel.nspname, lrel.relname, res->err)));
794 126 : walrcv_clear_result(res);
795 :
796 126 : copybuf = makeStringInfo();
797 :
798 126 : pstate = make_parsestate(NULL);
799 126 : (void) addRangeTableEntryForRelation(pstate, rel, AccessShareLock,
800 : NULL, false, false);
801 :
802 126 : attnamelist = make_copy_attnamelist(relmapentry);
803 126 : cstate = BeginCopyFrom(pstate, rel, NULL, false, copy_read_data, attnamelist, NIL);
804 :
805 : /* Do the copy */
806 126 : (void) CopyFrom(cstate);
807 :
808 122 : logicalrep_rel_close(relmapentry, NoLock);
809 122 : }
810 :
811 : /*
812 : * Start syncing the table in the sync worker.
813 : *
814 : * If nothing needs to be done to sync the table, we exit the worker without
815 : * any further action.
816 : *
817 : * The returned slot name is palloc'ed in current memory context.
818 : */
819 : char *
820 126 : LogicalRepSyncTableStart(XLogRecPtr *origin_startpos)
821 : {
822 : char *slotname;
823 : char *err;
824 : char relstate;
825 : XLogRecPtr relstate_lsn;
826 : Relation rel;
827 : WalRcvExecResult *res;
828 :
829 : /* Check the state of the table synchronization. */
830 126 : StartTransactionCommand();
831 126 : relstate = GetSubscriptionRelState(MyLogicalRepWorker->subid,
832 126 : MyLogicalRepWorker->relid,
833 : &relstate_lsn);
834 126 : CommitTransactionCommand();
835 :
836 126 : SpinLockAcquire(&MyLogicalRepWorker->relmutex);
837 126 : MyLogicalRepWorker->relstate = relstate;
838 126 : MyLogicalRepWorker->relstate_lsn = relstate_lsn;
839 126 : SpinLockRelease(&MyLogicalRepWorker->relmutex);
840 :
841 : /*
842 : * If synchronization is already done or no longer necessary, exit now
843 : * that we've updated shared memory state.
844 : */
845 126 : switch (relstate)
846 : {
847 : case SUBREL_STATE_SYNCDONE:
848 : case SUBREL_STATE_READY:
849 : case SUBREL_STATE_UNKNOWN:
850 0 : finish_sync_worker(); /* doesn't return */
851 : }
852 :
853 : /*
854 : * To build a slot name for the sync work, we are limited to NAMEDATALEN -
855 : * 1 characters. We cut the original slot name to NAMEDATALEN - 28 chars
856 : * and append _%u_sync_%u (1 + 10 + 6 + 10 + '\0'). (It's actually the
857 : * NAMEDATALEN on the remote that matters, but this scheme will also work
858 : * reasonably if that is different.)
859 : */
860 : StaticAssertStmt(NAMEDATALEN >= 32, "NAMEDATALEN too small"); /* for sanity */
861 378 : slotname = psprintf("%.*s_%u_sync_%u",
862 : NAMEDATALEN - 28,
863 126 : MySubscription->slotname,
864 126 : MySubscription->oid,
865 126 : MyLogicalRepWorker->relid);
866 :
867 : /*
868 : * Here we use the slot name instead of the subscription name as the
869 : * application_name, so that it is different from the main apply worker
870 : * and synchronous replication can distinguish them.
871 : */
872 126 : wrconn = walrcv_connect(MySubscription->conninfo, true, slotname, &err);
873 126 : if (wrconn == NULL)
874 0 : ereport(ERROR,
875 : (errmsg("could not connect to the publisher: %s", err)));
876 :
877 126 : Assert(MyLogicalRepWorker->relstate == SUBREL_STATE_INIT ||
878 : MyLogicalRepWorker->relstate == SUBREL_STATE_DATASYNC);
879 :
880 126 : SpinLockAcquire(&MyLogicalRepWorker->relmutex);
881 126 : MyLogicalRepWorker->relstate = SUBREL_STATE_DATASYNC;
882 126 : MyLogicalRepWorker->relstate_lsn = InvalidXLogRecPtr;
883 126 : SpinLockRelease(&MyLogicalRepWorker->relmutex);
884 :
885 : /* Update the state and make it visible to others. */
886 126 : StartTransactionCommand();
887 378 : UpdateSubscriptionRelState(MyLogicalRepWorker->subid,
888 126 : MyLogicalRepWorker->relid,
889 126 : MyLogicalRepWorker->relstate,
890 126 : MyLogicalRepWorker->relstate_lsn);
891 126 : CommitTransactionCommand();
892 126 : pgstat_report_stat(false);
893 :
894 : /*
895 : * We want to do the table data sync in a single transaction.
896 : */
897 126 : StartTransactionCommand();
898 :
899 : /*
900 : * Use a standard write lock here. It might be better to disallow access
901 : * to the table while it's being synchronized. But we don't want to block
902 : * the main apply process from working, and it has to open the relation in
903 : * RowExclusiveLock when remapping the remote relation id to the local one.
904 : */
905 126 : rel = table_open(MyLogicalRepWorker->relid, RowExclusiveLock);
906 :
907 : /*
908 : * Start a transaction in the remote node in REPEATABLE READ mode. This
909 : * ensures that both the replication slot we create (see below) and the
910 : * COPY are consistent with each other.
911 : */
912 126 : res = walrcv_exec(wrconn,
913 : "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ",
914 : 0, NULL);
915 126 : if (res->status != WALRCV_OK_COMMAND)
916 0 : ereport(ERROR,
917 : (errmsg("table copy could not start transaction on publisher"),
918 : errdetail("The error was: %s", res->err)));
919 126 : walrcv_clear_result(res);
920 :
921 : /*
922 : * Create a new temporary logical decoding slot. This slot will be used
923 : * for the catchup phase after COPY is done, so tell it to use the
924 : * snapshot to make the final data consistent.
925 : */
926 126 : walrcv_create_slot(wrconn, slotname, true,
927 : CRS_USE_SNAPSHOT, origin_startpos);
928 :
929 : /* Now do the initial data copy */
930 126 : PushActiveSnapshot(GetTransactionSnapshot());
931 126 : copy_table(rel);
932 122 : PopActiveSnapshot();
933 :
934 122 : res = walrcv_exec(wrconn, "COMMIT", 0, NULL);
935 122 : if (res->status != WALRCV_OK_COMMAND)
936 0 : ereport(ERROR,
937 : (errmsg("table copy could not finish transaction on publisher"),
938 : errdetail("The error was: %s", res->err)));
939 122 : walrcv_clear_result(res);
940 :
941 122 : table_close(rel, NoLock);
942 :
943 : /* Make the copy visible. */
944 122 : CommandCounterIncrement();
945 :
946 : /*
947 : * We are done with the initial data synchronization, update the state.
948 : */
949 122 : SpinLockAcquire(&MyLogicalRepWorker->relmutex);
950 122 : MyLogicalRepWorker->relstate = SUBREL_STATE_SYNCWAIT;
951 122 : MyLogicalRepWorker->relstate_lsn = *origin_startpos;
952 122 : SpinLockRelease(&MyLogicalRepWorker->relmutex);
953 :
954 : /*
955 : * Finally, wait until the main apply worker tells us to catch up and then
956 : * return to let LogicalRepApplyLoop do it.
957 : */
958 122 : wait_for_worker_state_change(SUBREL_STATE_CATCHUP);
959 122 : return slotname;
960 : }