Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * snapbuild.c
4 : *
5 : * Infrastructure for building historic catalog snapshots based on contents
6 : * of the WAL, for the purpose of decoding heapam.c style values in the
7 : * WAL.
8 : *
9 : * NOTES:
10 : *
11 : * We build snapshots which can *only* be used to read catalog contents and we
12 : * do so by reading and interpreting the WAL stream. The aim is to build a
13 : * snapshot that behaves the same as a freshly taken MVCC snapshot would have
14 : * at the time the XLogRecord was generated.
15 : *
16 : * To build the snapshots we reuse the infrastructure built for Hot
17 : * Standby. The in-memory snapshots we build look different than HS' because
18 : * we have different needs. To successfully decode data from the WAL we only
19 : * need to access catalog tables and (sys|rel|cat)cache, not the actual user
20 : * tables since the data we decode is wholly contained in the WAL
21 : * records. Also, our snapshots need to be different in comparison to normal
22 : * MVCC ones because in contrast to those we cannot fully rely on the clog and
23 : * pg_subtrans for information about committed transactions because they might
24 : * commit in the future from the POV of the WAL entry we're currently
25 : * decoding. This definition has the advantage that we only need to prevent
26 : * removal of catalog rows, while normal table's rows can still be
27 : * removed. This is achieved by using the replication slot mechanism.
28 : *
29 : * As the percentage of transactions modifying the catalog normally is fairly
30 : * small in comparison to ones only manipulating user data, we keep track of
31 : * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
32 : * track of all running transactions like it's done in a normal snapshot. Note
33 : * that we're generally only looking at transactions that have acquired an
34 : * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
35 : * that we consider committed, everything else is considered aborted/in
36 : * progress. That also allows us not to care about subtransactions before they
37 : * have committed which means this module, in contrast to HS, doesn't have to
38 : * care about suboverflowed subtransactions and similar.
39 : *
40 : * One complexity of doing this is that to e.g. handle mixed DDL/DML
41 : * transactions we need Snapshots that see intermediate versions of the
42 : * catalog in a transaction. During normal operation this is achieved by using
43 : * CommandIds/cmin/cmax. The problem with that however is that for space
44 : * efficiency reasons only one value of that is stored
45 : * (cf. combocid.c). Since ComboCids are only available in memory we log
46 : * additional information which allows us to get the original (cmin, cmax)
47 : * pair during visibility checks. Check the reorderbuffer.c's comment above
48 : * ResolveCminCmaxDuringDecoding() for details.
49 : *
50 : * To facilitate all this we need our own visibility routine, as the normal
51 : * ones are optimized for different usecases.
52 : *
53 : * To replace the normal catalog snapshots with decoding ones use the
54 : * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
55 : *
56 : *
57 : *
58 : * The snapbuild machinery is starting up in several stages, as illustrated
59 : * by the following graph describing the SnapBuild->state transitions:
60 : *
61 : *         +-------------------------+
62 : *    +----|         START           |-------------+
63 : *    |    +-------------------------+             |
64 : *    |                 |                          |
65 : *    |                 |                          |
66 : *    |        running_xacts #1                    |
67 : *    |                 |                          |
68 : *    |                 |                          |
69 : *    |                 v                          |
70 : *    |    +-------------------------+             v
71 : *    |    |   BUILDING_SNAPSHOT     |------------>|
72 : *    |    +-------------------------+             |
73 : *    |                 |                          |
74 : *    |                 |                          |
75 : *    | running_xacts #2, xacts from #1 finished   |
76 : *    |                 |                          |
77 : *    |                 |                          |
78 : *    |                 v                          |
79 : *    |    +-------------------------+             v
80 : *    |    |       FULL_SNAPSHOT     |------------>|
81 : *    |    +-------------------------+             |
82 : *    |                 |                          |
83 : * running_xacts        |                    saved snapshot
84 : * with zero xacts      |               at running_xacts's lsn
85 : *    |                 |                          |
86 : *    | running_xacts with xacts from #2 finished  |
87 : *    |                 |                          |
88 : *    |                 v                          |
89 : *    |    +-------------------------+             |
90 : *    +--->|SNAPBUILD_CONSISTENT     |<------------+
91 : *         +-------------------------+
92 : *
93 : * Initially the machinery is in the START stage. When an xl_running_xacts
94 : * record is read that is sufficiently new (above the safe xmin horizon),
95 : * there's a state transition. If there were no running xacts when the
96 : * running_xacts record was generated, we'll directly go into CONSISTENT
97 : * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
98 : * snapshot means that all transactions that start henceforth can be decoded
99 : * in their entirety, but transactions that started previously can't. In
100 : * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
101 : * running transactions have committed or aborted.
102 : *
103 : * Only transactions that commit after CONSISTENT state has been reached will
104 : * be replayed, even though they might have started while still in
105 : * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
106 : * changes have been exported, but all the following ones will be. That point
107 : * is a convenient point to initialize replication from, which is why we
108 : * export a snapshot at that point, which *can* be used to read normal data.
109 : *
110 : * Copyright (c) 2012-2020, PostgreSQL Global Development Group
111 : *
112 : * IDENTIFICATION
113 : * src/backend/replication/snapbuild.c
114 : *
115 : *-------------------------------------------------------------------------
116 : */
117 :
118 : #include "postgres.h"
119 :
120 : #include <sys/stat.h>
121 : #include <unistd.h>
122 :
123 : #include "access/heapam_xlog.h"
124 : #include "access/transam.h"
125 : #include "access/xact.h"
126 : #include "miscadmin.h"
127 : #include "pgstat.h"
128 : #include "replication/logical.h"
129 : #include "replication/reorderbuffer.h"
130 : #include "replication/snapbuild.h"
131 : #include "storage/block.h" /* debugging output */
132 : #include "storage/fd.h"
133 : #include "storage/lmgr.h"
134 : #include "storage/proc.h"
135 : #include "storage/procarray.h"
136 : #include "storage/standby.h"
137 : #include "utils/builtins.h"
138 : #include "utils/memutils.h"
139 : #include "utils/snapmgr.h"
140 : #include "utils/snapshot.h"
141 :
142 : /*
143 : * This struct contains the current state of the snapshot building
144 : * machinery. Besides a forward declaration in the header, it is not exposed
145 : * to the public, so we can easily change its contents.
146 : */
struct SnapBuild
{
	/* how far are we along building our first full snapshot */
	SnapBuildState state;

	/* private memory context used to allocate memory for this module. */
	MemoryContext context;

	/* all transactions < this have committed/aborted */
	TransactionId xmin;

	/* all transactions >= this are uncommitted */
	TransactionId xmax;

	/*
	 * Don't replay commits from an LSN < this LSN. This can be set externally
	 * but it will also be advanced (never retreat) from within snapbuild.c.
	 * SnapBuildXactNeedsSkip() compares against this value.
	 */
	XLogRecPtr	start_decoding_at;

	/*
	 * Don't start decoding WAL until the "xl_running_xacts" information
	 * indicates there are no running xids with an xid smaller than this.
	 */
	TransactionId initial_xmin_horizon;

	/* Indicates if we are building full snapshot or just catalog one. */
	bool		building_full_snapshot;

	/*
	 * Snapshot that's valid to see the catalog state seen at this moment.
	 * NULL until one has been built; the builder holds one reference on it,
	 * which FreeSnapshotBuilder() drops.
	 */
	Snapshot	snapshot;

	/*
	 * LSN of the last location we are sure a snapshot has been serialized to.
	 */
	XLogRecPtr	last_serialized_snapshot;

	/*
	 * The reorderbuffer we need to update with usable snapshots et al.
	 */
	ReorderBuffer *reorder;

	/*
	 * Outdated: This struct isn't used for its original purpose anymore, but
	 * can't be removed / changed in a minor version, because it's stored
	 * on-disk.
	 */
	struct
	{
		/*
		 * NB: One of these fields is misused, until a major version can
		 * break on-disk compatibility: SnapBuildNextPhaseAt() and
		 * SnapBuildStartNextPhaseAt() read and write was_xmax to track the
		 * xid at which the next phase of initial snapshot building starts.
		 */
		TransactionId was_xmin;
		TransactionId was_xmax;

		size_t		was_xcnt;	/* number of used xip entries */
		size_t		was_xcnt_space; /* allocated size of xip */
		TransactionId *was_xip; /* running xacts array, xidComparator-sorted */
	}			was_running;

	/*
	 * Array of transactions which could have catalog changes that committed
	 * between xmin and xmax.
	 */
	struct
	{
		/* number of committed transactions */
		size_t		xcnt;

		/* available space for committed transactions (in array entries) */
		size_t		xcnt_space;

		/*
		 * Until we reach a CONSISTENT state, we record commits of all
		 * transactions, not just the catalog changing ones. Record when that
		 * changes so we know we cannot export a snapshot safely anymore.
		 */
		bool		includes_all_transactions;

		/*
		 * Array of committed transactions that have modified the catalog.
		 *
		 * As this array is frequently modified we do *not* keep it in
		 * xidComparator order. Instead we sort the array when building &
		 * distributing a snapshot.
		 *
		 * TODO: It's unclear whether that reasoning has much merit. Every
		 * time we add something here after becoming consistent will also
		 * require distributing a snapshot. Storing them sorted would
		 * potentially also make it easier to purge (but more complicated wrt
		 * wraparound?). Should be improved if sorting while building the
		 * snapshot shows up in profiles.
		 */
		TransactionId *xip;
	}			committed;
};
247 :
/*
 * Starting a transaction -- which we need to do while exporting a snapshot --
 * removes knowledge about the previously used resowner, so we save it here.
 *
 * Both variables are set by SnapBuildExportSnapshot() and reset by
 * SnapBuildClearExportedSnapshot().
 */
static ResourceOwner SavedResourceOwnerDuringExport = NULL;
static bool ExportInProgress = false;

/* ->committed manipulation */
static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);

/* snapshot building/manipulation/distribution functions */
static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder);

/* frees a snapshot whose refcount has dropped to zero */
static void SnapBuildFreeSnapshot(Snapshot snap);

/* bumps a snapshot's active_count */
static void SnapBuildSnapIncRefcount(Snapshot snap);

/* queues builder->snapshot for the toplevel transactions being decoded */
static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);

/* xlog reading helper functions for SnapBuildProcessRunningXacts */
static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);

/* serialization functions */
static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
274 :
275 : /*
276 : * Return TransactionId after which the next phase of initial snapshot
277 : * building will happen.
278 : */
279 : static inline TransactionId
280 1996 : SnapBuildNextPhaseAt(SnapBuild *builder)
281 : {
282 : /*
283 : * For backward compatibility reasons this has to be stored in the wrongly
284 : * named field. Will be fixed in next major version.
285 : */
286 1996 : return builder->was_running.was_xmax;
287 : }
288 :
289 : /*
290 : * Set TransactionId after which the next phase of initial snapshot building
291 : * will happen.
292 : */
293 : static inline void
294 812 : SnapBuildStartNextPhaseAt(SnapBuild *builder, TransactionId at)
295 : {
296 : /*
297 : * For backward compatibility reasons this has to be stored in the wrongly
298 : * named field. Will be fixed in next major version.
299 : */
300 812 : builder->was_running.was_xmax = at;
301 812 : }
302 :
303 : /*
304 : * Allocate a new snapshot builder.
305 : *
306 : * xmin_horizon is the xid >= which we can be sure no catalog rows have been
307 : * removed, start_lsn is the LSN >= we want to replay commits.
308 : */
309 : SnapBuild *
310 820 : AllocateSnapshotBuilder(ReorderBuffer *reorder,
311 : TransactionId xmin_horizon,
312 : XLogRecPtr start_lsn,
313 : bool need_full_snapshot)
314 : {
315 : MemoryContext context;
316 : MemoryContext oldcontext;
317 : SnapBuild *builder;
318 :
319 : /* allocate memory in own context, to have better accountability */
320 820 : context = AllocSetContextCreate(CurrentMemoryContext,
321 : "snapshot builder context",
322 : ALLOCSET_DEFAULT_SIZES);
323 820 : oldcontext = MemoryContextSwitchTo(context);
324 :
325 820 : builder = palloc0(sizeof(SnapBuild));
326 :
327 820 : builder->state = SNAPBUILD_START;
328 820 : builder->context = context;
329 820 : builder->reorder = reorder;
330 : /* Other struct members initialized by zeroing via palloc0 above */
331 :
332 820 : builder->committed.xcnt = 0;
333 820 : builder->committed.xcnt_space = 128; /* arbitrary number */
334 820 : builder->committed.xip =
335 820 : palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
336 820 : builder->committed.includes_all_transactions = true;
337 :
338 820 : builder->initial_xmin_horizon = xmin_horizon;
339 820 : builder->start_decoding_at = start_lsn;
340 820 : builder->building_full_snapshot = need_full_snapshot;
341 :
342 820 : MemoryContextSwitchTo(oldcontext);
343 :
344 820 : return builder;
345 : }
346 :
347 : /*
348 : * Free a snapshot builder.
349 : */
350 : void
351 738 : FreeSnapshotBuilder(SnapBuild *builder)
352 : {
353 738 : MemoryContext context = builder->context;
354 :
355 : /* free snapshot explicitly, that contains some error checking */
356 738 : if (builder->snapshot != NULL)
357 : {
358 270 : SnapBuildSnapDecRefcount(builder->snapshot);
359 270 : builder->snapshot = NULL;
360 : }
361 :
362 : /* other resources are deallocated via memory context reset */
363 738 : MemoryContextDelete(context);
364 738 : }
365 :
366 : /*
367 : * Free an unreferenced snapshot that has previously been built by us.
368 : */
369 : static void
370 1396 : SnapBuildFreeSnapshot(Snapshot snap)
371 : {
372 : /* make sure we don't get passed an external snapshot */
373 1396 : Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
374 :
375 : /* make sure nobody modified our snapshot */
376 1396 : Assert(snap->curcid == FirstCommandId);
377 1396 : Assert(!snap->suboverflowed);
378 1396 : Assert(!snap->takenDuringRecovery);
379 1396 : Assert(snap->regd_count == 0);
380 :
381 : /* slightly more likely, so it's checked even without c-asserts */
382 1396 : if (snap->copied)
383 0 : elog(ERROR, "cannot free a copied snapshot");
384 :
385 1396 : if (snap->active_count)
386 0 : elog(ERROR, "cannot free an active snapshot");
387 :
388 1396 : pfree(snap);
389 1396 : }
390 :
391 : /*
392 : * In which state of snapshot building are we?
393 : */
394 : SnapBuildState
395 3735030 : SnapBuildCurrentState(SnapBuild *builder)
396 : {
397 3735030 : return builder->state;
398 : }
399 :
400 : /*
401 : * Should the contents of transaction ending at 'ptr' be decoded?
402 : */
403 : bool
404 566792 : SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
405 : {
406 566792 : return ptr < builder->start_decoding_at;
407 : }
408 :
409 : /*
410 : * Increase refcount of a snapshot.
411 : *
412 : * This is used when handing out a snapshot to some external resource or when
413 : * adding a Snapshot as builder->snapshot.
414 : */
415 : static void
416 5934 : SnapBuildSnapIncRefcount(Snapshot snap)
417 : {
418 5934 : snap->active_count++;
419 5934 : }
420 :
421 : /*
422 : * Decrease refcount of a snapshot and free if the refcount reaches zero.
423 : *
424 : * Externally visible, so that external resources that have been handed an
425 : * IncRef'ed Snapshot can adjust its refcount easily.
426 : */
427 : void
428 5816 : SnapBuildSnapDecRefcount(Snapshot snap)
429 : {
430 : /* make sure we don't get passed an external snapshot */
431 5816 : Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
432 :
433 : /* make sure nobody modified our snapshot */
434 5816 : Assert(snap->curcid == FirstCommandId);
435 5816 : Assert(!snap->suboverflowed);
436 5816 : Assert(!snap->takenDuringRecovery);
437 :
438 5816 : Assert(snap->regd_count == 0);
439 :
440 5816 : Assert(snap->active_count > 0);
441 :
442 : /* slightly more likely, so it's checked even without casserts */
443 5816 : if (snap->copied)
444 0 : elog(ERROR, "cannot free a copied snapshot");
445 :
446 5816 : snap->active_count--;
447 5816 : if (snap->active_count == 0)
448 1396 : SnapBuildFreeSnapshot(snap);
449 5816 : }
450 :
451 : /*
452 : * Build a new snapshot, based on currently committed catalog-modifying
453 : * transactions.
454 : *
455 : * In-progress transactions with catalog access are *not* allowed to modify
456 : * these snapshots; they have to copy them and fill in appropriate ->curcid
457 : * and ->subxip/subxcnt values.
458 : */
459 : static Snapshot
460 1626 : SnapBuildBuildSnapshot(SnapBuild *builder)
461 : {
462 : Snapshot snapshot;
463 : Size ssize;
464 :
465 1626 : Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
466 :
467 1626 : ssize = sizeof(SnapshotData)
468 1626 : + sizeof(TransactionId) * builder->committed.xcnt
469 1626 : + sizeof(TransactionId) * 1 /* toplevel xid */ ;
470 :
471 1626 : snapshot = MemoryContextAllocZero(builder->context, ssize);
472 :
473 1626 : snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC;
474 :
475 : /*
476 : * We misuse the original meaning of SnapshotData's xip and subxip fields
477 : * to make the more fitting for our needs.
478 : *
479 : * In the 'xip' array we store transactions that have to be treated as
480 : * committed. Since we will only ever look at tuples from transactions
481 : * that have modified the catalog it's more efficient to store those few
482 : * that exist between xmin and xmax (frequently there are none).
483 : *
484 : * Snapshots that are used in transactions that have modified the catalog
485 : * also use the 'subxip' array to store their toplevel xid and all the
486 : * subtransaction xids so we can recognize when we need to treat rows as
487 : * visible that are not in xip but still need to be visible. Subxip only
488 : * gets filled when the transaction is copied into the context of a
489 : * catalog modifying transaction since we otherwise share a snapshot
490 : * between transactions. As long as a txn hasn't modified the catalog it
491 : * doesn't need to treat any uncommitted rows as visible, so there is no
492 : * need for those xids.
493 : *
494 : * Both arrays are qsort'ed so that we can use bsearch() on them.
495 : */
496 1626 : Assert(TransactionIdIsNormal(builder->xmin));
497 1626 : Assert(TransactionIdIsNormal(builder->xmax));
498 :
499 1626 : snapshot->xmin = builder->xmin;
500 1626 : snapshot->xmax = builder->xmax;
501 :
502 : /* store all transactions to be treated as committed by this snapshot */
503 1626 : snapshot->xip =
504 1626 : (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
505 1626 : snapshot->xcnt = builder->committed.xcnt;
506 3252 : memcpy(snapshot->xip,
507 1626 : builder->committed.xip,
508 1626 : builder->committed.xcnt * sizeof(TransactionId));
509 :
510 : /* sort so we can bsearch() */
511 1626 : qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
512 :
513 : /*
514 : * Initially, subxip is empty, i.e. it's a snapshot to be used by
515 : * transactions that don't modify the catalog. Will be filled by
516 : * ReorderBufferCopySnap() if necessary.
517 : */
518 1626 : snapshot->subxcnt = 0;
519 1626 : snapshot->subxip = NULL;
520 :
521 1626 : snapshot->suboverflowed = false;
522 1626 : snapshot->takenDuringRecovery = false;
523 1626 : snapshot->copied = false;
524 1626 : snapshot->curcid = FirstCommandId;
525 1626 : snapshot->active_count = 0;
526 1626 : snapshot->regd_count = 0;
527 1626 : snapshot->snapXactCompletionCount = 0;
528 :
529 1626 : return snapshot;
530 : }
531 :
532 : /*
533 : * Build the initial slot snapshot and convert it to a normal snapshot that
534 : * is understood by HeapTupleSatisfiesMVCC.
535 : *
536 : * The snapshot will be usable directly in current transaction or exported
537 : * for loading in different transaction.
538 : */
539 : Snapshot
540 126 : SnapBuildInitialSnapshot(SnapBuild *builder)
541 : {
542 : Snapshot snap;
543 : TransactionId xid;
544 : TransactionId *newxip;
545 126 : int newxcnt = 0;
546 :
547 126 : Assert(!FirstSnapshotSet);
548 126 : Assert(XactIsoLevel == XACT_REPEATABLE_READ);
549 :
550 126 : if (builder->state != SNAPBUILD_CONSISTENT)
551 0 : elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
552 :
553 126 : if (!builder->committed.includes_all_transactions)
554 0 : elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
555 :
556 : /* so we don't overwrite the existing value */
557 126 : if (TransactionIdIsValid(MyProc->xmin))
558 0 : elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
559 :
560 126 : snap = SnapBuildBuildSnapshot(builder);
561 :
562 : /*
563 : * We know that snap->xmin is alive, enforced by the logical xmin
564 : * mechanism. Due to that we can do this without locks, we're only
565 : * changing our own value.
566 : */
567 : #ifdef USE_ASSERT_CHECKING
568 : {
569 : TransactionId safeXid;
570 :
571 126 : LWLockAcquire(ProcArrayLock, LW_SHARED);
572 126 : safeXid = GetOldestSafeDecodingTransactionId(false);
573 126 : LWLockRelease(ProcArrayLock);
574 :
575 126 : Assert(TransactionIdPrecedesOrEquals(safeXid, snap->xmin));
576 : }
577 : #endif
578 :
579 126 : MyProc->xmin = snap->xmin;
580 :
581 : /* allocate in transaction context */
582 126 : newxip = (TransactionId *)
583 126 : palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
584 :
585 : /*
586 : * snapbuild.c builds transactions in an "inverted" manner, which means it
587 : * stores committed transactions in ->xip, not ones in progress. Build a
588 : * classical snapshot by marking all non-committed transactions as
589 : * in-progress. This can be expensive.
590 : */
591 252 : for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
592 : {
593 : void *test;
594 :
595 : /*
596 : * Check whether transaction committed using the decoding snapshot
597 : * meaning of ->xip.
598 : */
599 0 : test = bsearch(&xid, snap->xip, snap->xcnt,
600 : sizeof(TransactionId), xidComparator);
601 :
602 0 : if (test == NULL)
603 : {
604 0 : if (newxcnt >= GetMaxSnapshotXidCount())
605 0 : ereport(ERROR,
606 : (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
607 : errmsg("initial slot snapshot too large")));
608 :
609 0 : newxip[newxcnt++] = xid;
610 : }
611 :
612 0 : TransactionIdAdvance(xid);
613 : }
614 :
615 : /* adjust remaining snapshot fields as needed */
616 126 : snap->snapshot_type = SNAPSHOT_MVCC;
617 126 : snap->xcnt = newxcnt;
618 126 : snap->xip = newxip;
619 :
620 126 : return snap;
621 : }
622 :
623 : /*
624 : * Export a snapshot so it can be set in another session with SET TRANSACTION
625 : * SNAPSHOT.
626 : *
627 : * For that we need to start a transaction in the current backend as the
628 : * importing side checks whether the source transaction is still open to make
629 : * sure the xmin horizon hasn't advanced since then.
630 : */
631 : const char *
632 0 : SnapBuildExportSnapshot(SnapBuild *builder)
633 : {
634 : Snapshot snap;
635 : char *snapname;
636 :
637 0 : if (IsTransactionOrTransactionBlock())
638 0 : elog(ERROR, "cannot export a snapshot from within a transaction");
639 :
640 0 : if (SavedResourceOwnerDuringExport)
641 0 : elog(ERROR, "can only export one snapshot at a time");
642 :
643 0 : SavedResourceOwnerDuringExport = CurrentResourceOwner;
644 0 : ExportInProgress = true;
645 :
646 0 : StartTransactionCommand();
647 :
648 : /* There doesn't seem to a nice API to set these */
649 0 : XactIsoLevel = XACT_REPEATABLE_READ;
650 0 : XactReadOnly = true;
651 :
652 0 : snap = SnapBuildInitialSnapshot(builder);
653 :
654 : /*
655 : * now that we've built a plain snapshot, make it active and use the
656 : * normal mechanisms for exporting it
657 : */
658 0 : snapname = ExportSnapshot(snap);
659 :
660 0 : ereport(LOG,
661 : (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
662 : "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
663 : snap->xcnt,
664 : snapname, snap->xcnt)));
665 0 : return snapname;
666 : }
667 :
668 : /*
669 : * Ensure there is a snapshot and if not build one for current transaction.
670 : */
671 : Snapshot
672 56 : SnapBuildGetOrBuildSnapshot(SnapBuild *builder, TransactionId xid)
673 : {
674 56 : Assert(builder->state == SNAPBUILD_CONSISTENT);
675 :
676 : /* only build a new snapshot if we don't have a prebuilt one */
677 56 : if (builder->snapshot == NULL)
678 : {
679 0 : builder->snapshot = SnapBuildBuildSnapshot(builder);
680 : /* increase refcount for the snapshot builder */
681 0 : SnapBuildSnapIncRefcount(builder->snapshot);
682 : }
683 :
684 56 : return builder->snapshot;
685 : }
686 :
687 : /*
688 : * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
689 : * any. Aborts the previously started transaction and resets the resource
690 : * owner back to its original value.
691 : */
692 : void
693 1454 : SnapBuildClearExportedSnapshot(void)
694 : {
695 : /* nothing exported, that is the usual case */
696 1454 : if (!ExportInProgress)
697 2908 : return;
698 :
699 0 : if (!IsTransactionState())
700 0 : elog(ERROR, "clearing exported snapshot in wrong transaction state");
701 :
702 : /* make sure nothing could have ever happened */
703 0 : AbortCurrentTransaction();
704 :
705 0 : CurrentResourceOwner = SavedResourceOwnerDuringExport;
706 0 : SavedResourceOwnerDuringExport = NULL;
707 0 : ExportInProgress = false;
708 : }
709 :
710 : /*
711 : * Handle the effects of a single heap change, appropriate to the current state
712 : * of the snapshot builder and returns whether changes made at (xid, lsn) can
713 : * be decoded.
714 : */
715 : bool
716 2992850 : SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
717 : {
718 : /*
719 : * We can't handle data in transactions if we haven't built a snapshot
720 : * yet, so don't store them.
721 : */
722 2992850 : if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
723 0 : return false;
724 :
725 : /*
726 : * No point in keeping track of changes in transactions that we don't have
727 : * enough information about to decode. This means that they started before
728 : * we got into the SNAPBUILD_FULL_SNAPSHOT state.
729 : */
730 2994836 : if (builder->state < SNAPBUILD_CONSISTENT &&
731 1986 : TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder)))
732 1986 : return false;
733 :
734 : /*
735 : * If the reorderbuffer doesn't yet have a snapshot, add one now, it will
736 : * be needed to decode the change we're currently processing.
737 : */
738 2990864 : if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
739 : {
740 : /* only build a new snapshot if we don't have a prebuilt one */
741 3226 : if (builder->snapshot == NULL)
742 : {
743 328 : builder->snapshot = SnapBuildBuildSnapshot(builder);
744 : /* increase refcount for the snapshot builder */
745 328 : SnapBuildSnapIncRefcount(builder->snapshot);
746 : }
747 :
748 : /*
749 : * Increase refcount for the transaction we're handing the snapshot
750 : * out to.
751 : */
752 3226 : SnapBuildSnapIncRefcount(builder->snapshot);
753 3226 : ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
754 : builder->snapshot);
755 : }
756 :
757 2990864 : return true;
758 : }
759 :
760 : /*
761 : * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record.
762 : * This implies that a transaction has done some form of write to system
763 : * catalogs.
764 : */
765 : void
766 30900 : SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
767 : XLogRecPtr lsn, xl_heap_new_cid *xlrec)
768 : {
769 : CommandId cid;
770 :
771 : /*
772 : * we only log new_cid's if a catalog tuple was modified, so mark the
773 : * transaction as containing catalog modifications
774 : */
775 30900 : ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
776 :
777 30900 : ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
778 : xlrec->target_node, xlrec->target_tid,
779 : xlrec->cmin, xlrec->cmax,
780 : xlrec->combocid);
781 :
782 : /* figure out new command id */
783 59452 : if (xlrec->cmin != InvalidCommandId &&
784 28552 : xlrec->cmax != InvalidCommandId)
785 5776 : cid = Max(xlrec->cmin, xlrec->cmax);
786 25124 : else if (xlrec->cmax != InvalidCommandId)
787 2348 : cid = xlrec->cmax;
788 22776 : else if (xlrec->cmin != InvalidCommandId)
789 22776 : cid = xlrec->cmin;
790 : else
791 : {
792 0 : cid = InvalidCommandId; /* silence compiler */
793 0 : elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
794 : }
795 :
796 30900 : ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
797 30900 : }
798 :
799 : /*
800 : * Add a new Snapshot to all transactions we're decoding that currently are
801 : * in-progress so they can see new catalog contents made by the transaction
802 : * that just committed. This is necessary because those in-progress
803 : * transactions will use the new catalog's contents from here on (at the very
804 : * least everything they do needs to be compatible with newer catalog
805 : * contents).
806 : */
807 : static void
808 1164 : SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
809 : {
810 : dlist_iter txn_i;
811 : ReorderBufferTXN *txn;
812 :
813 : /*
814 : * Iterate through all toplevel transactions. This can include
815 : * subtransactions which we just don't yet know to be that, but that's
816 : * fine, they will just get an unnecessary snapshot queued.
817 : */
818 2372 : dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
819 : {
820 1208 : txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
821 :
822 1208 : Assert(TransactionIdIsValid(txn->xid));
823 :
824 : /*
825 : * If we don't have a base snapshot yet, there are no changes in this
826 : * transaction which in turn implies we don't yet need a snapshot at
827 : * all. We'll add a snapshot when the first change gets queued.
828 : *
829 : * NB: This works correctly even for subtransactions because
830 : * ReorderBufferAssignChild() takes care to transfer the base snapshot
831 : * to the top-level transaction, and while iterating the changequeue
832 : * we'll get the change from the subtxn.
833 : */
834 1208 : if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
835 4 : continue;
836 :
837 1204 : elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
838 : txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
839 :
840 : /*
841 : * increase the snapshot's refcount for the transaction we are handing
842 : * it out to
843 : */
844 1204 : SnapBuildSnapIncRefcount(builder->snapshot);
845 1204 : ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
846 : builder->snapshot);
847 : }
848 1164 : }
849 :
850 : /*
851 : * Keep track of a new catalog changing transaction that has committed.
852 : */
853 : static void
854 1176 : SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
855 : {
856 1176 : Assert(TransactionIdIsValid(xid));
857 :
858 1176 : if (builder->committed.xcnt == builder->committed.xcnt_space)
859 : {
860 4 : builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
861 :
862 4 : elog(DEBUG1, "increasing space for committed transactions to %u",
863 : (uint32) builder->committed.xcnt_space);
864 :
865 4 : builder->committed.xip = repalloc(builder->committed.xip,
866 4 : builder->committed.xcnt_space * sizeof(TransactionId));
867 : }
868 :
869 : /*
870 : * TODO: It might make sense to keep the array sorted here instead of
871 : * doing it every time we build a new snapshot. On the other hand this
872 : * gets called repeatedly when a transaction with subtransactions commits.
873 : */
874 1176 : builder->committed.xip[builder->committed.xcnt++] = xid;
875 1176 : }
876 :
877 : /*
878 : * Remove knowledge about transactions we treat as committed that are smaller
879 : * than ->xmin. Those won't ever get checked via the ->committed array but via
880 : * the clog machinery, so we don't need to waste memory on them.
881 : */
882 : static void
883 186 : SnapBuildPurgeCommittedTxn(SnapBuild *builder)
884 : {
885 : int off;
886 : TransactionId *workspace;
887 186 : int surviving_xids = 0;
888 :
889 : /* not ready yet */
890 186 : if (!TransactionIdIsNormal(builder->xmin))
891 186 : return;
892 :
893 : /* TODO: Neater algorithm than just copying and iterating? */
894 186 : workspace =
895 186 : MemoryContextAlloc(builder->context,
896 186 : builder->committed.xcnt * sizeof(TransactionId));
897 :
898 : /* copy xids that still are interesting to workspace */
899 262 : for (off = 0; off < builder->committed.xcnt; off++)
900 : {
901 76 : if (NormalTransactionIdPrecedes(builder->committed.xip[off],
902 : builder->xmin))
903 : ; /* remove */
904 : else
905 4 : workspace[surviving_xids++] = builder->committed.xip[off];
906 : }
907 :
908 : /* copy workspace back to persistent state */
909 186 : memcpy(builder->committed.xip, workspace,
910 : surviving_xids * sizeof(TransactionId));
911 :
912 186 : elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
913 : (uint32) builder->committed.xcnt, (uint32) surviving_xids,
914 : builder->xmin, builder->xmax);
915 186 : builder->committed.xcnt = surviving_xids;
916 :
917 186 : pfree(workspace);
918 : }
919 :
920 : /*
921 : * Handle everything that needs to be done when a transaction commits
922 : */
923 : void
924 3138 : SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
925 : int nsubxacts, TransactionId *subxacts)
926 : {
927 : int nxact;
928 :
929 3138 : bool needs_snapshot = false;
930 3138 : bool needs_timetravel = false;
931 3138 : bool sub_needs_timetravel = false;
932 :
933 3138 : TransactionId xmax = xid;
934 :
935 : /*
936 : * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
937 : * will they be part of a snapshot. So we don't need to record anything.
938 : */
939 6276 : if (builder->state == SNAPBUILD_START ||
940 3138 : (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
941 0 : TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder))))
942 : {
943 : /* ensure that only commits after this are getting replayed */
944 0 : if (builder->start_decoding_at <= lsn)
945 0 : builder->start_decoding_at = lsn + 1;
946 0 : return;
947 : }
948 :
949 3138 : if (builder->state < SNAPBUILD_CONSISTENT)
950 : {
951 : /* ensure that only commits after this are getting replayed */
952 6 : if (builder->start_decoding_at <= lsn)
953 4 : builder->start_decoding_at = lsn + 1;
954 :
955 : /*
956 : * If building an exportable snapshot, force xid to be tracked, even
957 : * if the transaction didn't modify the catalog.
958 : */
959 6 : if (builder->building_full_snapshot)
960 : {
961 2 : needs_timetravel = true;
962 : }
963 : }
964 :
965 5588 : for (nxact = 0; nxact < nsubxacts; nxact++)
966 : {
967 2450 : TransactionId subxid = subxacts[nxact];
968 :
969 : /*
970 : * Add subtransaction to base snapshot if catalog modifying, we don't
971 : * distinguish to toplevel transactions there.
972 : */
973 2450 : if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
974 : {
975 10 : sub_needs_timetravel = true;
976 10 : needs_snapshot = true;
977 :
978 10 : elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
979 : xid, subxid);
980 :
981 10 : SnapBuildAddCommittedTxn(builder, subxid);
982 :
983 10 : if (NormalTransactionIdFollows(subxid, xmax))
984 10 : xmax = subxid;
985 : }
986 :
987 : /*
988 : * If we're forcing timetravel we also need visibility information
989 : * about subtransaction, so keep track of subtransaction's state, even
990 : * if not catalog modifying. Don't need to distribute a snapshot in
991 : * that case.
992 : */
993 2440 : else if (needs_timetravel)
994 : {
995 0 : SnapBuildAddCommittedTxn(builder, subxid);
996 0 : if (NormalTransactionIdFollows(subxid, xmax))
997 0 : xmax = subxid;
998 : }
999 : }
1000 :
1001 : /* if top-level modified catalog, it'll need a snapshot */
1002 3138 : if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
1003 : {
1004 1164 : elog(DEBUG2, "found top level transaction %u, with catalog changes",
1005 : xid);
1006 1164 : needs_snapshot = true;
1007 1164 : needs_timetravel = true;
1008 1164 : SnapBuildAddCommittedTxn(builder, xid);
1009 : }
1010 1974 : else if (sub_needs_timetravel)
1011 : {
1012 : /* track toplevel txn as well, subxact alone isn't meaningful */
1013 0 : SnapBuildAddCommittedTxn(builder, xid);
1014 : }
1015 1974 : else if (needs_timetravel)
1016 : {
1017 2 : elog(DEBUG2, "forced transaction %u to do timetravel", xid);
1018 :
1019 2 : SnapBuildAddCommittedTxn(builder, xid);
1020 : }
1021 :
1022 3138 : if (!needs_timetravel)
1023 : {
1024 : /* record that we cannot export a general snapshot anymore */
1025 1972 : builder->committed.includes_all_transactions = false;
1026 : }
1027 :
1028 3138 : Assert(!needs_snapshot || needs_timetravel);
1029 :
1030 : /*
1031 : * Adjust xmax of the snapshot builder, we only do that for committed,
1032 : * catalog modifying, transactions, everything else isn't interesting for
1033 : * us since we'll never look at the respective rows.
1034 : */
1035 4304 : if (needs_timetravel &&
1036 2332 : (!TransactionIdIsValid(builder->xmax) ||
1037 1166 : TransactionIdFollowsOrEquals(xmax, builder->xmax)))
1038 : {
1039 1164 : builder->xmax = xmax;
1040 1164 : TransactionIdAdvance(builder->xmax);
1041 : }
1042 :
1043 : /* if there's any reason to build a historic snapshot, do so now */
1044 3138 : if (needs_snapshot)
1045 : {
1046 : /*
1047 : * If we haven't built a complete snapshot yet there's no need to hand
1048 : * it out, it wouldn't (and couldn't) be used anyway.
1049 : */
1050 1164 : if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
1051 0 : return;
1052 :
1053 : /*
1054 : * Decrease the snapshot builder's refcount of the old snapshot, note
1055 : * that it still will be used if it has been handed out to the
1056 : * reorderbuffer earlier.
1057 : */
1058 1164 : if (builder->snapshot)
1059 1164 : SnapBuildSnapDecRefcount(builder->snapshot);
1060 :
1061 1164 : builder->snapshot = SnapBuildBuildSnapshot(builder);
1062 :
1063 : /* we might need to execute invalidations, add snapshot */
1064 1164 : if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
1065 : {
1066 4 : SnapBuildSnapIncRefcount(builder->snapshot);
1067 4 : ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
1068 : builder->snapshot);
1069 : }
1070 :
1071 : /* refcount of the snapshot builder for the new snapshot */
1072 1164 : SnapBuildSnapIncRefcount(builder->snapshot);
1073 :
1074 : /* add a new catalog snapshot to all currently running transactions */
1075 1164 : SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
1076 : }
1077 : }
1078 :
1079 :
1080 : /* -----------------------------------
1081 : * Snapshot building functions dealing with xlog records
1082 : * -----------------------------------
1083 : */
1084 :
1085 : /*
1086 : * Process a running xacts record, and use its information to first build a
1087 : * historic snapshot and later to release resources that aren't needed
1088 : * anymore.
1089 : */
1090 : void
1091 982 : SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1092 : {
1093 : ReorderBufferTXN *txn;
1094 : TransactionId xmin;
1095 :
1096 : /*
1097 : * If we're not consistent yet, inspect the record to see whether it
1098 : * allows to get closer to being consistent. If we are consistent, dump
1099 : * our snapshot so others or we, after a restart, can use it.
1100 : */
1101 982 : if (builder->state < SNAPBUILD_CONSISTENT)
1102 : {
1103 : /* returns false if there's no point in performing cleanup just yet */
1104 820 : if (!SnapBuildFindSnapshot(builder, lsn, running))
1105 796 : return;
1106 : }
1107 : else
1108 162 : SnapBuildSerialize(builder, lsn);
1109 :
1110 : /*
1111 : * Update range of interesting xids based on the running xacts
1112 : * information. We don't increase ->xmax using it, because once we are in
1113 : * a consistent state we can do that ourselves and much more efficiently
1114 : * so, because we only need to do it for catalog transactions since we
1115 : * only ever look at those.
1116 : *
1117 : * NB: We only increase xmax when a catalog modifying transaction commits
1118 : * (see SnapBuildCommitTxn). Because of this, xmax can be lower than
1119 : * xmin, which looks odd but is correct and actually more efficient, since
1120 : * we hit fast paths in heapam_visibility.c.
1121 : */
1122 186 : builder->xmin = running->oldestRunningXid;
1123 :
1124 : /* Remove transactions we don't need to keep track off anymore */
1125 186 : SnapBuildPurgeCommittedTxn(builder);
1126 :
1127 : /*
1128 : * Advance the xmin limit for the current replication slot, to allow
1129 : * vacuum to clean up the tuples this slot has been protecting.
1130 : *
1131 : * The reorderbuffer might have an xmin among the currently running
1132 : * snapshots; use it if so. If not, we need only consider the snapshots
1133 : * we'll produce later, which can't be less than the oldest running xid in
1134 : * the record we're reading now.
1135 : */
1136 186 : xmin = ReorderBufferGetOldestXmin(builder->reorder);
1137 186 : if (xmin == InvalidTransactionId)
1138 168 : xmin = running->oldestRunningXid;
1139 186 : elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
1140 : builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
1141 186 : LogicalIncreaseXminForSlot(lsn, xmin);
1142 :
1143 : /*
1144 : * Also tell the slot where we can restart decoding from. We don't want to
1145 : * do that after every commit because changing that implies an fsync of
1146 : * the logical slot's state file, so we only do it every time we see a
1147 : * running xacts record.
1148 : *
1149 : * Do so by looking for the oldest in progress transaction (determined by
1150 : * the first LSN of any of its relevant records). Every transaction
1151 : * remembers the last location we stored the snapshot to disk before its
1152 : * beginning. That point is where we can restart from.
1153 : */
1154 :
1155 : /*
1156 : * Can't know about a serialized snapshot's location if we're not
1157 : * consistent.
1158 : */
1159 186 : if (builder->state < SNAPBUILD_CONSISTENT)
1160 20 : return;
1161 :
1162 166 : txn = ReorderBufferGetOldestTXN(builder->reorder);
1163 :
1164 : /*
1165 : * oldest ongoing txn might have started when we didn't yet serialize
1166 : * anything because we hadn't reached a consistent state yet.
1167 : */
1168 166 : if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
1169 10 : LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
1170 :
1171 : /*
1172 : * No in-progress transaction, can reuse the last serialized snapshot if
1173 : * we have one.
1174 : */
1175 304 : else if (txn == NULL &&
1176 292 : builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
1177 144 : builder->last_serialized_snapshot != InvalidXLogRecPtr)
1178 144 : LogicalIncreaseRestartDecodingForSlot(lsn,
1179 : builder->last_serialized_snapshot);
1180 : }
1181 :
1182 :
1183 : /*
1184 : * Build the start of a snapshot that's capable of decoding the catalog.
1185 : *
1186 : * Helper function for SnapBuildProcessRunningXacts() while we're not yet
1187 : * consistent.
1188 : *
1189 : * Returns true if there is a point in performing internal maintenance/cleanup
1190 : * using the xl_running_xacts record.
1191 : */
1192 : static bool
1193 820 : SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
1194 : {
1195 : /* ---
1196 : * Build catalog decoding snapshot incrementally using information about
1197 : * the currently running transactions. There are several ways to do that:
1198 : *
1199 : * a) There were no running transactions when the xl_running_xacts record
1200 : * was inserted, jump to CONSISTENT immediately. We might find such a
1201 : * state while waiting on c)'s sub-states.
1202 : *
1203 : * b) This (in a previous run) or another decoding slot serialized a
1204 : * snapshot to disk that we can use. Can't use this method for the
1205 : * initial snapshot when slot is being created and needs full snapshot
1206 : * for export or direct use, as that snapshot will only contain catalog
1207 : * modifying transactions.
1208 : *
1209 : * c) First incrementally build a snapshot for catalog tuples
1210 : * (BUILDING_SNAPSHOT), that requires all, already in-progress,
1211 : * transactions to finish. Every transaction starting after that
1212 : * (FULL_SNAPSHOT state), has enough information to be decoded. But
1213 : * for older running transactions no viable snapshot exists yet, so
1214 : * CONSISTENT will only be reached once all of those have finished.
1215 : * ---
1216 : */
1217 :
1218 : /*
1219 : * xl_running_xact record is older than what we can use, we might not have
1220 : * all necessary catalog rows anymore.
1221 : */
1222 1148 : if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
1223 328 : NormalTransactionIdPrecedes(running->oldestRunningXid,
1224 : builder->initial_xmin_horizon))
1225 : {
1226 0 : ereport(DEBUG1,
1227 : (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
1228 : (uint32) (lsn >> 32), (uint32) lsn),
1229 : errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
1230 : builder->initial_xmin_horizon, running->oldestRunningXid)));
1231 :
1232 :
1233 0 : SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon);
1234 :
1235 0 : return true;
1236 : }
1237 :
1238 : /*
1239 : * a) No transaction were running, we can jump to consistent.
1240 : *
1241 : * This is not affected by races around xl_running_xacts, because we can
1242 : * miss transaction commits, but currently not transactions starting.
1243 : *
1244 : * NB: We might have already started to incrementally assemble a snapshot,
1245 : * so we need to be careful to deal with that.
1246 : */
1247 820 : if (running->oldestRunningXid == running->nextXid)
1248 : {
1249 1264 : if (builder->start_decoding_at == InvalidXLogRecPtr ||
1250 476 : builder->start_decoding_at <= lsn)
1251 : /* can decode everything after this */
1252 314 : builder->start_decoding_at = lsn + 1;
1253 :
1254 : /* As no transactions were running xmin/xmax can be trivially set. */
1255 788 : builder->xmin = running->nextXid; /* < are finished */
1256 788 : builder->xmax = running->nextXid; /* >= are running */
1257 :
1258 : /* so we can safely use the faster comparisons */
1259 788 : Assert(TransactionIdIsNormal(builder->xmin));
1260 788 : Assert(TransactionIdIsNormal(builder->xmax));
1261 :
1262 788 : builder->state = SNAPBUILD_CONSISTENT;
1263 788 : SnapBuildStartNextPhaseAt(builder, InvalidTransactionId);
1264 :
1265 788 : ereport(LOG,
1266 : (errmsg("logical decoding found consistent point at %X/%X",
1267 : (uint32) (lsn >> 32), (uint32) lsn),
1268 : errdetail("There are no running transactions.")));
1269 :
1270 788 : return false;
1271 : }
1272 : /* b) valid on disk state and not building full snapshot */
1273 58 : else if (!builder->building_full_snapshot &&
1274 26 : SnapBuildRestore(builder, lsn))
1275 : {
1276 : /* there won't be any state to cleanup */
1277 8 : return false;
1278 : }
1279 :
1280 : /*
1281 : * c) transition from START to BUILDING_SNAPSHOT.
1282 : *
1283 : * In START state, and a xl_running_xacts record with running xacts is
1284 : * encountered. In that case, switch to BUILDING_SNAPSHOT state, and
1285 : * record xl_running_xacts->nextXid. Once all running xacts have finished
1286 : * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It
1287 : * might look that we could use xl_running_xact's ->xids information to
1288 : * get there quicker, but that is problematic because transactions marked
1289 : * as running, might already have inserted their commit record - it's
1290 : * infeasible to change that with locking.
1291 : */
1292 24 : else if (builder->state == SNAPBUILD_START)
1293 : {
1294 14 : builder->state = SNAPBUILD_BUILDING_SNAPSHOT;
1295 14 : SnapBuildStartNextPhaseAt(builder, running->nextXid);
1296 :
1297 : /*
1298 : * Start with an xmin/xmax that's correct for future, when all the
1299 : * currently running transactions have finished. We'll update both
1300 : * while waiting for the pending transactions to finish.
1301 : */
1302 14 : builder->xmin = running->nextXid; /* < are finished */
1303 14 : builder->xmax = running->nextXid; /* >= are running */
1304 :
1305 : /* so we can safely use the faster comparisons */
1306 14 : Assert(TransactionIdIsNormal(builder->xmin));
1307 14 : Assert(TransactionIdIsNormal(builder->xmax));
1308 :
1309 14 : ereport(LOG,
1310 : (errmsg("logical decoding found initial starting point at %X/%X",
1311 : (uint32) (lsn >> 32), (uint32) lsn),
1312 : errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1313 : running->xcnt, running->nextXid)));
1314 :
1315 14 : SnapBuildWaitSnapshot(running, running->nextXid);
1316 : }
1317 :
1318 : /*
1319 : * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
1320 : *
1321 : * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
1322 : * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This
1323 : * means all transactions starting afterwards have enough information to
1324 : * be decoded. Switch to FULL_SNAPSHOT.
1325 : */
1326 16 : else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
1327 6 : TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder),
1328 : running->oldestRunningXid))
1329 : {
1330 6 : builder->state = SNAPBUILD_FULL_SNAPSHOT;
1331 6 : SnapBuildStartNextPhaseAt(builder, running->nextXid);
1332 :
1333 6 : ereport(LOG,
1334 : (errmsg("logical decoding found initial consistent point at %X/%X",
1335 : (uint32) (lsn >> 32), (uint32) lsn),
1336 : errdetail("Waiting for transactions (approximately %d) older than %u to end.",
1337 : running->xcnt, running->nextXid)));
1338 :
1339 6 : SnapBuildWaitSnapshot(running, running->nextXid);
1340 : }
1341 :
1342 : /*
1343 : * c) transition from FULL_SNAPSHOT to CONSISTENT.
1344 : *
1345 : * In FULL_SNAPSHOT state (see d) ), and this xl_running_xacts'
1346 : * oldestRunningXid is >= than nextXid from when we switched to
1347 : * FULL_SNAPSHOT. This means all transactions that are currently in
1348 : * progress have a catalog snapshot, and all their changes have been
1349 : * collected. Switch to CONSISTENT.
1350 : */
1351 8 : else if (builder->state == SNAPBUILD_FULL_SNAPSHOT &&
1352 4 : TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder),
1353 : running->oldestRunningXid))
1354 : {
1355 4 : builder->state = SNAPBUILD_CONSISTENT;
1356 4 : SnapBuildStartNextPhaseAt(builder, InvalidTransactionId);
1357 :
1358 4 : ereport(LOG,
1359 : (errmsg("logical decoding found consistent point at %X/%X",
1360 : (uint32) (lsn >> 32), (uint32) lsn),
1361 : errdetail("There are no old transactions anymore.")));
1362 : }
1363 :
1364 : /*
1365 : * We already started to track running xacts and need to wait for all
1366 : * in-progress ones to finish. We fall through to the normal processing of
1367 : * records so incremental cleanup can be performed.
1368 : */
1369 24 : return true;
1370 :
1371 : }
1372 :
1373 : /* ---
1374 : * Iterate through xids in record, wait for all older than the cutoff to
1375 : * finish. Then, if possible, log a new xl_running_xacts record.
1376 : *
1377 : * This isn't required for the correctness of decoding, but to:
1378 : * a) allow isolationtester to notice that we're currently waiting for
1379 : * something.
1380 : * b) log a new xl_running_xacts record where it'd be helpful, without having
1381 : * to write for bgwriter or checkpointer.
1382 : * ---
1383 : */
1384 : static void
1385 20 : SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
1386 : {
1387 : int off;
1388 :
1389 40 : for (off = 0; off < running->xcnt; off++)
1390 : {
1391 20 : TransactionId xid = running->xids[off];
1392 :
1393 : /*
1394 : * Upper layers should prevent that we ever need to wait on ourselves.
1395 : * Check anyway, since failing to do so would either result in an
1396 : * endless wait or an Assert() failure.
1397 : */
1398 20 : if (TransactionIdIsCurrentTransactionId(xid))
1399 0 : elog(ERROR, "waiting for ourselves");
1400 :
1401 20 : if (TransactionIdFollows(xid, cutoff))
1402 0 : continue;
1403 :
1404 20 : XactLockTableWait(xid, NULL, NULL, XLTW_None);
1405 : }
1406 :
1407 : /*
1408 : * All transactions we needed to finish finished - try to ensure there is
1409 : * another xl_running_xacts record in a timely manner, without having to
1410 : * write for bgwriter or checkpointer to log one. During recovery we
1411 : * can't enforce that, so we'll have to wait.
1412 : */
1413 20 : if (!RecoveryInProgress())
1414 : {
1415 20 : LogStandbySnapshot();
1416 : }
1417 20 : }
1418 :
1419 : /* -----------------------------------
1420 : * Snapshot serialization support
1421 : * -----------------------------------
1422 : */
1423 :
1424 : /*
1425 : * We store current state of struct SnapBuild on disk in the following manner:
1426 : *
1427 : * struct SnapBuildOnDisk;
1428 : * TransactionId * running.xcnt_space;
1429 : * TransactionId * committed.xcnt; (*not xcnt_space*)
1430 : *
1431 : */
1432 : typedef struct SnapBuildOnDisk
1433 : {
1434 : /* first part of this struct needs to be version independent */
1435 :
1436 : /* data not covered by checksum */
1437 : uint32 magic;
1438 : pg_crc32c checksum;
1439 :
1440 : /* data covered by checksum */
1441 :
1442 : /* version, in case we want to support pg_upgrade */
1443 : uint32 version;
1444 : /* how large is the on disk data, excluding the constant sized part */
1445 : uint32 length;
1446 :
1447 : /* version dependent part */
1448 : SnapBuild builder;
1449 :
1450 : /* variable amount of TransactionIds follows */
1451 : } SnapBuildOnDisk;
1452 :
1453 : #define SnapBuildOnDiskConstantSize \
1454 : offsetof(SnapBuildOnDisk, builder)
1455 : #define SnapBuildOnDiskNotChecksummedSize \
1456 : offsetof(SnapBuildOnDisk, version)
1457 :
1458 : #define SNAPBUILD_MAGIC 0x51A1E001
1459 : #define SNAPBUILD_VERSION 2
1460 :
1461 : /*
1462 : * Store/Load a snapshot from disk, depending on the snapshot builder's state.
1463 : *
1464 : * Supposed to be used by external (i.e. not snapbuild.c) code that just read
1465 : * a record that's a potential location for a serialized snapshot.
1466 : */
1467 : void
1468 30 : SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
1469 : {
1470 30 : if (builder->state < SNAPBUILD_CONSISTENT)
1471 0 : SnapBuildRestore(builder, lsn);
1472 : else
1473 30 : SnapBuildSerialize(builder, lsn);
1474 30 : }
1475 :
1476 : /*
1477 : * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
1478 : * been done by another decoding process.
1479 : */
1480 : static void
1481 192 : SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
1482 : {
1483 : Size needed_length;
1484 : SnapBuildOnDisk *ondisk;
1485 : char *ondisk_c;
1486 : int fd;
1487 : char tmppath[MAXPGPATH];
1488 : char path[MAXPGPATH];
1489 : int ret;
1490 : struct stat stat_buf;
1491 : Size sz;
1492 :
1493 192 : Assert(lsn != InvalidXLogRecPtr);
1494 192 : Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
1495 : builder->last_serialized_snapshot <= lsn);
1496 :
1497 : /*
1498 : * no point in serializing if we cannot continue to work immediately after
1499 : * restoring the snapshot
1500 : */
1501 192 : if (builder->state < SNAPBUILD_CONSISTENT)
1502 192 : return;
1503 :
1504 : /*
1505 : * We identify snapshots by the LSN they are valid for. We don't need to
1506 : * include timelines in the name as each LSN maps to exactly one timeline
1507 : * unless the user used pg_resetwal or similar. If a user did so, there's
1508 : * no hope continuing to decode anyway.
1509 : */
1510 384 : sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1511 192 : (uint32) (lsn >> 32), (uint32) lsn);
1512 :
1513 : /*
1514 : * first check whether some other backend already has written the snapshot
1515 : * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
1516 : * as a valid state. Everything else is an unexpected error.
1517 : */
1518 192 : ret = stat(path, &stat_buf);
1519 :
1520 192 : if (ret != 0 && errno != ENOENT)
1521 0 : ereport(ERROR,
1522 : (errcode_for_file_access(),
1523 : errmsg("could not stat file \"%s\": %m", path)));
1524 :
1525 192 : else if (ret == 0)
1526 : {
1527 : /*
1528 : * somebody else has already serialized to this point, don't overwrite
1529 : * but remember location, so we don't need to read old data again.
1530 : *
1531 : * To be sure it has been synced to disk after the rename() from the
1532 : * tempfile filename to the real filename, we just repeat the fsync.
1533 : * That ought to be cheap because in most scenarios it should already
1534 : * be safely on disk.
1535 : */
1536 44 : fsync_fname(path, false);
1537 44 : fsync_fname("pg_logical/snapshots", true);
1538 :
1539 44 : builder->last_serialized_snapshot = lsn;
1540 44 : goto out;
1541 : }
1542 :
1543 : /*
1544 : * there is an obvious race condition here between the time we stat(2) the
1545 : * file and us writing the file. But we rename the file into place
1546 : * atomically and all files created need to contain the same data anyway,
1547 : * so this is perfectly fine, although a bit of a resource waste. Locking
1548 : * seems like pointless complication.
1549 : */
1550 148 : elog(DEBUG1, "serializing snapshot to %s", path);
1551 :
1552 : /* to make sure only we will write to this tempfile, include pid */
1553 296 : sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%u.tmp",
1554 148 : (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
1555 :
1556 : /*
1557 : * Unlink temporary file if it already exists, needs to have been before a
1558 : * crash/error since we won't enter this function twice from within a
1559 : * single decoding slot/backend and the temporary file contains the pid of
1560 : * the current process.
1561 : */
1562 148 : if (unlink(tmppath) != 0 && errno != ENOENT)
1563 0 : ereport(ERROR,
1564 : (errcode_for_file_access(),
1565 : errmsg("could not remove file \"%s\": %m", tmppath)));
1566 :
1567 148 : needed_length = sizeof(SnapBuildOnDisk) +
1568 148 : sizeof(TransactionId) * builder->committed.xcnt;
1569 :
1570 148 : ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
1571 148 : ondisk = (SnapBuildOnDisk *) ondisk_c;
1572 148 : ondisk->magic = SNAPBUILD_MAGIC;
1573 148 : ondisk->version = SNAPBUILD_VERSION;
1574 148 : ondisk->length = needed_length;
1575 148 : INIT_CRC32C(ondisk->checksum);
1576 148 : COMP_CRC32C(ondisk->checksum,
1577 : ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
1578 : SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1579 148 : ondisk_c += sizeof(SnapBuildOnDisk);
1580 :
1581 148 : memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
1582 : /* NULL-ify memory-only data */
1583 148 : ondisk->builder.context = NULL;
1584 148 : ondisk->builder.snapshot = NULL;
1585 148 : ondisk->builder.reorder = NULL;
1586 148 : ondisk->builder.committed.xip = NULL;
1587 :
1588 148 : COMP_CRC32C(ondisk->checksum,
1589 : &ondisk->builder,
1590 : sizeof(SnapBuild));
1591 :
1592 : /* there shouldn't be any running xacts */
1593 148 : Assert(builder->was_running.was_xcnt == 0);
1594 :
1595 : /* copy committed xacts */
1596 148 : sz = sizeof(TransactionId) * builder->committed.xcnt;
1597 148 : memcpy(ondisk_c, builder->committed.xip, sz);
1598 148 : COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
1599 148 : ondisk_c += sz;
1600 :
1601 148 : FIN_CRC32C(ondisk->checksum);
1602 :
1603 : /* we have valid data now, open tempfile and write it there */
1604 148 : fd = OpenTransientFile(tmppath,
1605 : O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
1606 148 : if (fd < 0)
1607 0 : ereport(ERROR,
1608 : (errcode_for_file_access(),
1609 : errmsg("could not open file \"%s\": %m", tmppath)));
1610 :
1611 148 : errno = 0;
1612 148 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE);
1613 148 : if ((write(fd, ondisk, needed_length)) != needed_length)
1614 : {
1615 0 : int save_errno = errno;
1616 :
1617 0 : CloseTransientFile(fd);
1618 :
1619 : /* if write didn't set errno, assume problem is no disk space */
1620 0 : errno = save_errno ? save_errno : ENOSPC;
1621 0 : ereport(ERROR,
1622 : (errcode_for_file_access(),
1623 : errmsg("could not write to file \"%s\": %m", tmppath)));
1624 : }
1625 148 : pgstat_report_wait_end();
1626 :
1627 : /*
1628 : * fsync the file before renaming so that even if we crash after this we
1629 : * have either a fully valid file or nothing.
1630 : *
1631 : * It's safe to just ERROR on fsync() here because we'll retry the whole
1632 : * operation including the writes.
1633 : *
1634 : * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
1635 : * some noticeable overhead since it's performed synchronously during
1636 : * decoding?
1637 : */
1638 148 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC);
1639 148 : if (pg_fsync(fd) != 0)
1640 : {
1641 0 : int save_errno = errno;
1642 :
1643 0 : CloseTransientFile(fd);
1644 0 : errno = save_errno;
1645 0 : ereport(ERROR,
1646 : (errcode_for_file_access(),
1647 : errmsg("could not fsync file \"%s\": %m", tmppath)));
1648 : }
1649 148 : pgstat_report_wait_end();
1650 :
1651 148 : if (CloseTransientFile(fd) != 0)
1652 0 : ereport(ERROR,
1653 : (errcode_for_file_access(),
1654 : errmsg("could not close file \"%s\": %m", tmppath)));
1655 :
1656 148 : fsync_fname("pg_logical/snapshots", true);
1657 :
1658 : /*
1659 : * We may overwrite the work from some other backend, but that's ok, our
1660 : * snapshot is valid as well, we'll just have done some superfluous work.
1661 : */
1662 148 : if (rename(tmppath, path) != 0)
1663 : {
1664 0 : ereport(ERROR,
1665 : (errcode_for_file_access(),
1666 : errmsg("could not rename file \"%s\" to \"%s\": %m",
1667 : tmppath, path)));
1668 : }
1669 :
1670 : /* make sure we persist */
1671 148 : fsync_fname(path, false);
1672 148 : fsync_fname("pg_logical/snapshots", true);
1673 :
1674 : /*
1675 : * Now there's no way we can loose the dumped state anymore, remember this
1676 : * as a serialization point.
1677 : */
1678 148 : builder->last_serialized_snapshot = lsn;
1679 :
1680 : out:
1681 192 : ReorderBufferSetRestartPoint(builder->reorder,
1682 : builder->last_serialized_snapshot);
1683 : }
1684 :
1685 : /*
1686 : * Restore a snapshot into 'builder' if previously one has been stored at the
1687 : * location indicated by 'lsn'. Returns true if successful, false otherwise.
1688 : */
1689 : static bool
1690 26 : SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
1691 : {
1692 : SnapBuildOnDisk ondisk;
1693 : int fd;
1694 : char path[MAXPGPATH];
1695 : Size sz;
1696 : int readBytes;
1697 : pg_crc32c checksum;
1698 :
1699 : /* no point in loading a snapshot if we're already there */
1700 26 : if (builder->state == SNAPBUILD_CONSISTENT)
1701 0 : return false;
1702 :
1703 52 : sprintf(path, "pg_logical/snapshots/%X-%X.snap",
1704 26 : (uint32) (lsn >> 32), (uint32) lsn);
1705 :
1706 26 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
1707 :
1708 26 : if (fd < 0 && errno == ENOENT)
1709 18 : return false;
1710 8 : else if (fd < 0)
1711 0 : ereport(ERROR,
1712 : (errcode_for_file_access(),
1713 : errmsg("could not open file \"%s\": %m", path)));
1714 :
1715 : /* ----
1716 : * Make sure the snapshot had been stored safely to disk, that's normally
1717 : * cheap.
1718 : * Note that we do not need PANIC here, nobody will be able to use the
1719 : * slot without fsyncing, and saving it won't succeed without an fsync()
1720 : * either...
1721 : * ----
1722 : */
1723 8 : fsync_fname(path, false);
1724 8 : fsync_fname("pg_logical/snapshots", true);
1725 :
1726 :
1727 : /* read statically sized portion of snapshot */
1728 8 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
1729 8 : readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
1730 8 : pgstat_report_wait_end();
1731 8 : if (readBytes != SnapBuildOnDiskConstantSize)
1732 : {
1733 0 : int save_errno = errno;
1734 :
1735 0 : CloseTransientFile(fd);
1736 :
1737 0 : if (readBytes < 0)
1738 : {
1739 0 : errno = save_errno;
1740 0 : ereport(ERROR,
1741 : (errcode_for_file_access(),
1742 : errmsg("could not read file \"%s\": %m", path)));
1743 : }
1744 : else
1745 0 : ereport(ERROR,
1746 : (errcode(ERRCODE_DATA_CORRUPTED),
1747 : errmsg("could not read file \"%s\": read %d of %zu",
1748 : path, readBytes,
1749 : (Size) SnapBuildOnDiskConstantSize)));
1750 : }
1751 :
1752 8 : if (ondisk.magic != SNAPBUILD_MAGIC)
1753 0 : ereport(ERROR,
1754 : (errcode(ERRCODE_DATA_CORRUPTED),
1755 : errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
1756 : path, ondisk.magic, SNAPBUILD_MAGIC)));
1757 :
1758 8 : if (ondisk.version != SNAPBUILD_VERSION)
1759 0 : ereport(ERROR,
1760 : (errcode(ERRCODE_DATA_CORRUPTED),
1761 : errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
1762 : path, ondisk.version, SNAPBUILD_VERSION)));
1763 :
1764 8 : INIT_CRC32C(checksum);
1765 8 : COMP_CRC32C(checksum,
1766 : ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
1767 : SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
1768 :
1769 : /* read SnapBuild */
1770 8 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
1771 8 : readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
1772 8 : pgstat_report_wait_end();
1773 8 : if (readBytes != sizeof(SnapBuild))
1774 : {
1775 0 : int save_errno = errno;
1776 :
1777 0 : CloseTransientFile(fd);
1778 :
1779 0 : if (readBytes < 0)
1780 : {
1781 0 : errno = save_errno;
1782 0 : ereport(ERROR,
1783 : (errcode_for_file_access(),
1784 : errmsg("could not read file \"%s\": %m", path)));
1785 : }
1786 : else
1787 0 : ereport(ERROR,
1788 : (errcode(ERRCODE_DATA_CORRUPTED),
1789 : errmsg("could not read file \"%s\": read %d of %zu",
1790 : path, readBytes, sizeof(SnapBuild))));
1791 : }
1792 8 : COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
1793 :
1794 : /* restore running xacts (dead, but kept for backward compat) */
1795 8 : sz = sizeof(TransactionId) * ondisk.builder.was_running.was_xcnt_space;
1796 8 : ondisk.builder.was_running.was_xip =
1797 8 : MemoryContextAllocZero(builder->context, sz);
1798 8 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
1799 8 : readBytes = read(fd, ondisk.builder.was_running.was_xip, sz);
1800 8 : pgstat_report_wait_end();
1801 8 : if (readBytes != sz)
1802 : {
1803 0 : int save_errno = errno;
1804 :
1805 0 : CloseTransientFile(fd);
1806 :
1807 0 : if (readBytes < 0)
1808 : {
1809 0 : errno = save_errno;
1810 0 : ereport(ERROR,
1811 : (errcode_for_file_access(),
1812 : errmsg("could not read file \"%s\": %m", path)));
1813 : }
1814 : else
1815 0 : ereport(ERROR,
1816 : (errcode(ERRCODE_DATA_CORRUPTED),
1817 : errmsg("could not read file \"%s\": read %d of %zu",
1818 : path, readBytes, sz)));
1819 : }
1820 8 : COMP_CRC32C(checksum, ondisk.builder.was_running.was_xip, sz);
1821 :
1822 : /* restore committed xacts information */
1823 8 : sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
1824 8 : ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
1825 8 : pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
1826 8 : readBytes = read(fd, ondisk.builder.committed.xip, sz);
1827 8 : pgstat_report_wait_end();
1828 8 : if (readBytes != sz)
1829 : {
1830 0 : int save_errno = errno;
1831 :
1832 0 : CloseTransientFile(fd);
1833 :
1834 0 : if (readBytes < 0)
1835 : {
1836 0 : errno = save_errno;
1837 0 : ereport(ERROR,
1838 : (errcode_for_file_access(),
1839 : errmsg("could not read file \"%s\": %m", path)));
1840 : }
1841 : else
1842 0 : ereport(ERROR,
1843 : (errcode(ERRCODE_DATA_CORRUPTED),
1844 : errmsg("could not read file \"%s\": read %d of %zu",
1845 : path, readBytes, sz)));
1846 : }
1847 8 : COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
1848 :
1849 8 : if (CloseTransientFile(fd) != 0)
1850 0 : ereport(ERROR,
1851 : (errcode_for_file_access(),
1852 : errmsg("could not close file \"%s\": %m", path)));
1853 :
1854 8 : FIN_CRC32C(checksum);
1855 :
1856 : /* verify checksum of what we've read */
1857 8 : if (!EQ_CRC32C(checksum, ondisk.checksum))
1858 0 : ereport(ERROR,
1859 : (errcode(ERRCODE_DATA_CORRUPTED),
1860 : errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
1861 : path, checksum, ondisk.checksum)));
1862 :
1863 : /*
1864 : * ok, we now have a sensible snapshot here, figure out if it has more
1865 : * information than we have.
1866 : */
1867 :
1868 : /*
1869 : * We are only interested in consistent snapshots for now, comparing
1870 : * whether one incomplete snapshot is more "advanced" seems to be
1871 : * unnecessarily complex.
1872 : */
1873 8 : if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
1874 0 : goto snapshot_not_interesting;
1875 :
1876 : /*
1877 : * Don't use a snapshot that requires an xmin that we cannot guarantee to
1878 : * be available.
1879 : */
1880 8 : if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
1881 0 : goto snapshot_not_interesting;
1882 :
1883 :
1884 : /* ok, we think the snapshot is sensible, copy over everything important */
1885 8 : builder->xmin = ondisk.builder.xmin;
1886 8 : builder->xmax = ondisk.builder.xmax;
1887 8 : builder->state = ondisk.builder.state;
1888 :
1889 8 : builder->committed.xcnt = ondisk.builder.committed.xcnt;
1890 : /* We only allocated/stored xcnt, not xcnt_space xids ! */
1891 : /* don't overwrite preallocated xip, if we don't have anything here */
1892 8 : if (builder->committed.xcnt > 0)
1893 : {
1894 8 : pfree(builder->committed.xip);
1895 8 : builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
1896 8 : builder->committed.xip = ondisk.builder.committed.xip;
1897 : }
1898 8 : ondisk.builder.committed.xip = NULL;
1899 :
1900 : /* our snapshot is not interesting anymore, build a new one */
1901 8 : if (builder->snapshot != NULL)
1902 : {
1903 0 : SnapBuildSnapDecRefcount(builder->snapshot);
1904 : }
1905 8 : builder->snapshot = SnapBuildBuildSnapshot(builder);
1906 8 : SnapBuildSnapIncRefcount(builder->snapshot);
1907 :
1908 8 : ReorderBufferSetRestartPoint(builder->reorder, lsn);
1909 :
1910 8 : Assert(builder->state == SNAPBUILD_CONSISTENT);
1911 :
1912 8 : ereport(LOG,
1913 : (errmsg("logical decoding found consistent point at %X/%X",
1914 : (uint32) (lsn >> 32), (uint32) lsn),
1915 : errdetail("Logical decoding will begin using saved snapshot.")));
1916 8 : return true;
1917 :
1918 : snapshot_not_interesting:
1919 0 : if (ondisk.builder.committed.xip != NULL)
1920 0 : pfree(ondisk.builder.committed.xip);
1921 0 : return false;
1922 : }
1923 :
1924 : /*
1925 : * Remove all serialized snapshots that are not required anymore because no
1926 : * slot can need them. This doesn't actually have to run during a checkpoint,
1927 : * but it's a convenient point to schedule this.
1928 : *
1929 : * NB: We run this during checkpoints even if logical decoding is disabled so
1930 : * we cleanup old slots at some point after it got disabled.
1931 : */
1932 : void
1933 702 : CheckPointSnapBuild(void)
1934 : {
1935 : XLogRecPtr cutoff;
1936 : XLogRecPtr redo;
1937 : DIR *snap_dir;
1938 : struct dirent *snap_de;
1939 : char path[MAXPGPATH + 21];
1940 :
1941 : /*
1942 : * We start off with a minimum of the last redo pointer. No new
1943 : * replication slot will start before that, so that's a safe upper bound
1944 : * for removal.
1945 : */
1946 702 : redo = GetRedoRecPtr();
1947 :
1948 : /* now check for the restart ptrs from existing slots */
1949 702 : cutoff = ReplicationSlotsComputeLogicalRestartLSN();
1950 :
1951 : /* don't start earlier than the restart lsn */
1952 702 : if (redo < cutoff)
1953 0 : cutoff = redo;
1954 :
1955 702 : snap_dir = AllocateDir("pg_logical/snapshots");
1956 2962 : while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
1957 : {
1958 : uint32 hi;
1959 : uint32 lo;
1960 : XLogRecPtr lsn;
1961 : struct stat statbuf;
1962 :
1963 2414 : if (strcmp(snap_de->d_name, ".") == 0 ||
1964 856 : strcmp(snap_de->d_name, "..") == 0)
1965 2808 : continue;
1966 :
1967 154 : snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
1968 :
1969 154 : if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
1970 : {
1971 0 : elog(DEBUG1, "only regular files expected: %s", path);
1972 0 : continue;
1973 : }
1974 :
1975 : /*
1976 : * temporary filenames from SnapBuildSerialize() include the LSN and
1977 : * everything but are postfixed by .$pid.tmp. We can just remove them
1978 : * the same as other files because there can be none that are
1979 : * currently being written that are older than cutoff.
1980 : *
1981 : * We just log a message if a file doesn't fit the pattern, it's
1982 : * probably some editors lock/state file or similar...
1983 : */
1984 154 : if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
1985 : {
1986 0 : ereport(LOG,
1987 : (errmsg("could not parse file name \"%s\"", path)));
1988 0 : continue;
1989 : }
1990 :
1991 154 : lsn = ((uint64) hi) << 32 | lo;
1992 :
1993 : /* check whether we still need it */
1994 154 : if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
1995 : {
1996 104 : elog(DEBUG1, "removing snapbuild snapshot %s", path);
1997 :
1998 : /*
1999 : * It's not particularly harmful, though strange, if we can't
2000 : * remove the file here. Don't prevent the checkpoint from
2001 : * completing, that'd be a cure worse than the disease.
2002 : */
2003 104 : if (unlink(path) < 0)
2004 : {
2005 0 : ereport(LOG,
2006 : (errcode_for_file_access(),
2007 : errmsg("could not remove file \"%s\": %m",
2008 : path)));
2009 0 : continue;
2010 : }
2011 : }
2012 : }
2013 702 : FreeDir(snap_dir);
2014 702 : }
|