LCOV - code coverage report
Current view: top level - src/backend/replication/logical - snapbuild.c (source / functions) Hit Total Coverage
Test: PostgreSQL 14devel Lines: 425 533 79.7 %
Date: 2020-10-28 11:24:57 Functions: 26 27 96.3 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * snapbuild.c
       4             :  *
       5             :  *    Infrastructure for building historic catalog snapshots based on contents
       6             :  *    of the WAL, for the purpose of decoding heapam.c style values in the
       7             :  *    WAL.
       8             :  *
       9             :  * NOTES:
      10             :  *
      11             :  * We build snapshots which can *only* be used to read catalog contents and we
      12             :  * do so by reading and interpreting the WAL stream. The aim is to build a
      13             :  * snapshot that behaves the same as a freshly taken MVCC snapshot would have
      14             :  * at the time the XLogRecord was generated.
      15             :  *
      16             :  * To build the snapshots we reuse the infrastructure built for Hot
      17             :  * Standby. The in-memory snapshots we build look different than HS' because
      18             :  * we have different needs. To successfully decode data from the WAL we only
      19             :  * need to access catalog tables and (sys|rel|cat)cache, not the actual user
      20             :  * tables since the data we decode is wholly contained in the WAL
      21             :  * records. Also, our snapshots need to be different in comparison to normal
      22             :  * MVCC ones because in contrast to those we cannot fully rely on the clog and
      23             :  * pg_subtrans for information about committed transactions because they might
      24             :  * commit in the future from the POV of the WAL entry we're currently
      25             :  * decoding. This definition has the advantage that we only need to prevent
      26             :  * removal of catalog rows, while normal table's rows can still be
      27             :  * removed. This is achieved by using the replication slot mechanism.
      28             :  *
      29             :  * As the percentage of transactions modifying the catalog normally is fairly
      30             :  * small in comparison to ones only manipulating user data, we keep track of
      31             :  * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
      32             :  * track of all running transactions like it's done in a normal snapshot. Note
      33             :  * that we're generally only looking at transactions that have acquired an
      34             :  * xid. That is we keep a list of transactions between snapshot->(xmin, xmax)
      35             :  * that we consider committed, everything else is considered aborted/in
      36             :  * progress. That also allows us not to care about subtransactions before they
      37             :  * have committed which means this module, in contrast to HS, doesn't have to
      38             :  * care about suboverflowed subtransactions and similar.
      39             :  *
      40             :  * One complexity of doing this is that to e.g. handle mixed DDL/DML
      41             :  * transactions we need Snapshots that see intermediate versions of the
      42             :  * catalog in a transaction. During normal operation this is achieved by using
      43             :  * CommandIds/cmin/cmax. The problem with that however is that for space
      44             :  * efficiency reasons only one value of that is stored
      45             :  * (cf. combocid.c). Since ComboCids are only available in memory we log
      46             :  * additional information which allows us to get the original (cmin, cmax)
      47             :  * pair during visibility checks. Check the reorderbuffer.c's comment above
      48             :  * ResolveCminCmaxDuringDecoding() for details.
      49             :  *
      50             :  * To facilitate all this we need our own visibility routine, as the normal
      51             :  * ones are optimized for different usecases.
      52             :  *
      53             :  * To replace the normal catalog snapshots with decoding ones use the
      54             :  * SetupHistoricSnapshot() and TeardownHistoricSnapshot() functions.
      55             :  *
      56             :  *
      57             :  *
      58             :  * The snapbuild machinery is starting up in several stages, as illustrated
      59             :  * by the following graph describing the SnapBuild->state transitions:
      60             :  *
      61             :  *         +-------------------------+
      62             :  *    +----|         START           |-------------+
      63             :  *    |    +-------------------------+             |
      64             :  *    |                 |                          |
      65             :  *    |                 |                          |
      66             :  *    |        running_xacts #1                    |
      67             :  *    |                 |                          |
      68             :  *    |                 |                          |
      69             :  *    |                 v                          |
      70             :  *    |    +-------------------------+             v
      71             :  *    |    |   BUILDING_SNAPSHOT     |------------>|
      72             :  *    |    +-------------------------+             |
      73             :  *    |                 |                          |
      74             :  *    |                 |                          |
      75             :  *    | running_xacts #2, xacts from #1 finished   |
      76             :  *    |                 |                          |
      77             :  *    |                 |                          |
      78             :  *    |                 v                          |
      79             :  *    |    +-------------------------+             v
      80             :  *    |    |       FULL_SNAPSHOT     |------------>|
      81             :  *    |    +-------------------------+             |
      82             :  *    |                 |                          |
      83             :  * running_xacts        |                      saved snapshot
      84             :  * with zero xacts      |                 at running_xacts's lsn
      85             :  *    |                 |                          |
      86             :  *    | running_xacts with xacts from #2 finished  |
      87             :  *    |                 |                          |
      88             :  *    |                 v                          |
      89             :  *    |    +-------------------------+             |
      90             :  *    +--->|SNAPBUILD_CONSISTENT     |<------------+
      91             :  *         +-------------------------+
      92             :  *
      93             :  * Initially the machinery is in the START stage. When an xl_running_xacts
      94             :  * record is read that is sufficiently new (above the safe xmin horizon),
      95             :  * there's a state transition. If there were no running xacts when the
      96             :  * running_xacts record was generated, we'll directly go into CONSISTENT
      97             :  * state, otherwise we'll switch to the BUILDING_SNAPSHOT state. Having a full
      98             :  * snapshot means that all transactions that start henceforth can be decoded
      99             :  * in their entirety, but transactions that started previously can't. In
     100             :  * FULL_SNAPSHOT we'll switch into CONSISTENT once all those previously
     101             :  * running transactions have committed or aborted.
     102             :  *
     103             :  * Only transactions that commit after CONSISTENT state has been reached will
     104             :  * be replayed, even though they might have started while still in
     105             :  * FULL_SNAPSHOT. That ensures that we'll reach a point where no previous
     106             :  * changes have been exported, but all the following ones will be. That point
     107             :  * is a convenient point to initialize replication from, which is why we
     108             :  * export a snapshot at that point, which *can* be used to read normal data.
     109             :  *
     110             :  * Copyright (c) 2012-2020, PostgreSQL Global Development Group
     111             :  *
     112             :  * IDENTIFICATION
     113             :  *    src/backend/replication/logical/snapbuild.c
     114             :  *
     115             :  *-------------------------------------------------------------------------
     116             :  */
     117             : 
     118             : #include "postgres.h"
     119             : 
     120             : #include <sys/stat.h>
     121             : #include <unistd.h>
     122             : 
     123             : #include "access/heapam_xlog.h"
     124             : #include "access/transam.h"
     125             : #include "access/xact.h"
     126             : #include "miscadmin.h"
     127             : #include "pgstat.h"
     128             : #include "replication/logical.h"
     129             : #include "replication/reorderbuffer.h"
     130             : #include "replication/snapbuild.h"
     131             : #include "storage/block.h"        /* debugging output */
     132             : #include "storage/fd.h"
     133             : #include "storage/lmgr.h"
     134             : #include "storage/proc.h"
     135             : #include "storage/procarray.h"
     136             : #include "storage/standby.h"
     137             : #include "utils/builtins.h"
     138             : #include "utils/memutils.h"
     139             : #include "utils/snapmgr.h"
     140             : #include "utils/snapshot.h"
     141             : 
     142             : /*
     143             :  * This struct contains the current state of the snapshot building
     144             :  * machinery. Besides a forward declaration in the header, it is not exposed
     145             :  * to the public, so we can easily change its contents.
     146             :  */
     147             : struct SnapBuild
     148             : {
     149             :     /* how far are we along building our first full snapshot */
     150             :     SnapBuildState state;
     151             : 
     152             :     /* private memory context used to allocate memory for this module. */
     153             :     MemoryContext context;
     154             : 
     155             :     /* all transactions < this have committed/aborted */
     156             :     TransactionId xmin;
     157             : 
     158             :     /* all transactions >= this are uncommitted */
     159             :     TransactionId xmax;
     160             : 
     161             :     /*
     162             :      * Don't replay commits from an LSN < this LSN. This can be set externally
     163             :      * but it will also be advanced (never retreat) from within snapbuild.c.
     164             :      */
     165             :     XLogRecPtr  start_decoding_at;
     166             : 
     167             :     /*
     168             :      * Don't start decoding WAL until the "xl_running_xacts" information
     169             :      * indicates there are no running xids with an xid smaller than this.
     170             :      */
     171             :     TransactionId initial_xmin_horizon;
     172             : 
     173             :     /* Indicates if we are building full snapshot or just catalog one. */
     174             :     bool        building_full_snapshot;
     175             : 
     176             :     /*
     177             :      * Snapshot that's valid to see the catalog state seen at this moment.
     178             :      */
     179             :     Snapshot    snapshot;
     180             : 
     181             :     /*
     182             :      * LSN of the last location we are sure a snapshot has been serialized to.
     183             :      */
     184             :     XLogRecPtr  last_serialized_snapshot;
     185             : 
     186             :     /*
     187             :      * The reorderbuffer we need to update with usable snapshots et al.
     188             :      */
     189             :     ReorderBuffer *reorder;
     190             : 
     191             :     /*
     192             :      * Outdated: This struct isn't used for its original purpose anymore, but
     193             :      * can't be removed / changed in a minor version, because it's stored
     194             :      * on-disk.
     195             :      */
     196             :     struct
     197             :     {
     198             :         /*
     199             :          * NB: This field is misused, until a major version can break on-disk
     200             :          * compatibility. See SnapBuildNextPhaseAt() /
     201             :          * SnapBuildStartNextPhaseAt().
     202             :          */
     203             :         TransactionId was_xmin;
     204             :         TransactionId was_xmax;
     205             : 
     206             :         size_t      was_xcnt;   /* number of used xip entries */
     207             :         size_t      was_xcnt_space; /* allocated size of xip */
     208             :         TransactionId *was_xip; /* running xacts array, xidComparator-sorted */
     209             :     }           was_running;
     210             : 
     211             :     /*
     212             :      * Array of transactions which could have catalog changes that committed
     213             :      * between xmin and xmax.
     214             :      */
     215             :     struct
     216             :     {
     217             :         /* number of committed transactions */
     218             :         size_t      xcnt;
     219             : 
     220             :         /* available space for committed transactions */
     221             :         size_t      xcnt_space;
     222             : 
     223             :         /*
     224             :          * Until we reach a CONSISTENT state, we record commits of all
     225             :          * transactions, not just the catalog changing ones. Record when that
     226             :          * changes so we know we cannot export a snapshot safely anymore.
     227             :          */
     228             :         bool        includes_all_transactions;
     229             : 
     230             :         /*
     231             :          * Array of committed transactions that have modified the catalog.
     232             :          *
     233             :          * As this array is frequently modified we do *not* keep it in
     234             :          * xidComparator order. Instead we sort the array when building &
     235             :          * distributing a snapshot.
     236             :          *
     237             :          * TODO: It's unclear whether that reasoning has much merit. Every
     238             :          * time we add something here after becoming consistent will also
     239             :          * require distributing a snapshot. Storing them sorted would
     240             :          * potentially also make it easier to purge (but more complicated wrt
     241             :          * wraparound?). Should be improved if sorting while building the
     242             :          * snapshot shows up in profiles.
     243             :          */
     244             :         TransactionId *xip;
     245             :     }           committed;
     246             : };
     247             : 
     248             : /*
     249             :  * Starting a transaction -- which we need to do while exporting a snapshot --
     250             :  * removes knowledge about the previously used resowner, so we save it here.
     251             :  */
     252             : static ResourceOwner SavedResourceOwnerDuringExport = NULL;
     253             : static bool ExportInProgress = false;
     254             : 
     255             : /* ->committed manipulation */
     256             : static void SnapBuildPurgeCommittedTxn(SnapBuild *builder);
     257             : 
     258             : /* snapshot building/manipulation/distribution functions */
     259             : static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder);
     260             : 
     261             : static void SnapBuildFreeSnapshot(Snapshot snap);
     262             : 
     263             : static void SnapBuildSnapIncRefcount(Snapshot snap);
     264             : 
     265             : static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn);
     266             : 
     267             : /* xlog reading helper functions for SnapBuildProcessRunningXacts */
     268             : static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running);
     269             : static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff);
     270             : 
     271             : /* serialization functions */
     272             : static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn);
     273             : static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn);
     274             : 
     275             : /*
     276             :  * Return TransactionId after which the next phase of initial snapshot
     277             :  * building will happen.
     278             :  */
     279             : static inline TransactionId
     280        1996 : SnapBuildNextPhaseAt(SnapBuild *builder)
     281             : {
     282             :     /*
     283             :      * For backward compatibility reasons this has to be stored in the wrongly
     284             :      * named field.  Will be fixed in next major version.
     285             :      */
     286        1996 :     return builder->was_running.was_xmax;
     287             : }
     288             : 
     289             : /*
     290             :  * Set TransactionId after which the next phase of initial snapshot building
     291             :  * will happen.
     292             :  */
     293             : static inline void
     294         812 : SnapBuildStartNextPhaseAt(SnapBuild *builder, TransactionId at)
     295             : {
     296             :     /*
     297             :      * For backward compatibility reasons this has to be stored in the wrongly
     298             :      * named field.  Will be fixed in next major version.
     299             :      */
     300         812 :     builder->was_running.was_xmax = at;
     301         812 : }
     302             : 
     303             : /*
     304             :  * Allocate a new snapshot builder.
     305             :  *
     306             :  * xmin_horizon is the xid >= which we can be sure no catalog rows have been
     307             :  * removed, start_lsn is the LSN >= which we want to replay commits.
     308             :  */
     309             : SnapBuild *
     310         820 : AllocateSnapshotBuilder(ReorderBuffer *reorder,
     311             :                         TransactionId xmin_horizon,
     312             :                         XLogRecPtr start_lsn,
     313             :                         bool need_full_snapshot)
     314             : {
     315             :     MemoryContext context;
     316             :     MemoryContext oldcontext;
     317             :     SnapBuild  *builder;
     318             : 
     319             :     /* allocate memory in own context, to have better accountability */
     320         820 :     context = AllocSetContextCreate(CurrentMemoryContext,
     321             :                                     "snapshot builder context",
     322             :                                     ALLOCSET_DEFAULT_SIZES);
     323         820 :     oldcontext = MemoryContextSwitchTo(context);
     324             : 
     325         820 :     builder = palloc0(sizeof(SnapBuild));
     326             : 
     327         820 :     builder->state = SNAPBUILD_START;
     328         820 :     builder->context = context;
     329         820 :     builder->reorder = reorder;
     330             :     /* Other struct members initialized by zeroing via palloc0 above */
     331             : 
     332         820 :     builder->committed.xcnt = 0;
     333         820 :     builder->committed.xcnt_space = 128; /* arbitrary number */
     334         820 :     builder->committed.xip =
     335         820 :         palloc0(builder->committed.xcnt_space * sizeof(TransactionId));
     336         820 :     builder->committed.includes_all_transactions = true;
     337             : 
     338         820 :     builder->initial_xmin_horizon = xmin_horizon;
     339         820 :     builder->start_decoding_at = start_lsn;
     340         820 :     builder->building_full_snapshot = need_full_snapshot;
     341             : 
     342         820 :     MemoryContextSwitchTo(oldcontext);
     343             : 
     344         820 :     return builder;
     345             : }
     346             : 
     347             : /*
     348             :  * Free a snapshot builder.
     349             :  */
     350             : void
     351         738 : FreeSnapshotBuilder(SnapBuild *builder)
     352             : {
     353         738 :     MemoryContext context = builder->context;
     354             : 
     355             :     /* free snapshot explicitly, that contains some error checking */
     356         738 :     if (builder->snapshot != NULL)
     357             :     {
     358         270 :         SnapBuildSnapDecRefcount(builder->snapshot);
     359         270 :         builder->snapshot = NULL;
     360             :     }
     361             : 
     362             :     /* other resources are deallocated via memory context deletion */
     363         738 :     MemoryContextDelete(context);
     364         738 : }
     365             : 
     366             : /*
     367             :  * Free an unreferenced snapshot that has previously been built by us.
     368             :  */
     369             : static void
     370        1396 : SnapBuildFreeSnapshot(Snapshot snap)
     371             : {
     372             :     /* make sure we don't get passed an external snapshot */
     373        1396 :     Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
     374             : 
     375             :     /* make sure nobody modified our snapshot */
     376        1396 :     Assert(snap->curcid == FirstCommandId);
     377        1396 :     Assert(!snap->suboverflowed);
     378        1396 :     Assert(!snap->takenDuringRecovery);
     379        1396 :     Assert(snap->regd_count == 0);
     380             : 
     381             :     /* slightly more likely, so it's checked even without c-asserts */
     382        1396 :     if (snap->copied)
     383           0 :         elog(ERROR, "cannot free a copied snapshot");
     384             : 
     385        1396 :     if (snap->active_count)
     386           0 :         elog(ERROR, "cannot free an active snapshot");
     387             : 
     388        1396 :     pfree(snap);
     389        1396 : }
     390             : 
     391             : /*
     392             :  * In which state of snapshot building are we?
     393             :  */
     394             : SnapBuildState
     395     3735030 : SnapBuildCurrentState(SnapBuild *builder)
     396             : {
     397     3735030 :     return builder->state;
     398             : }
     399             : 
     400             : /*
     401             :  * Should the contents of transaction ending at 'ptr' be decoded?
     402             :  */
     403             : bool
     404      566792 : SnapBuildXactNeedsSkip(SnapBuild *builder, XLogRecPtr ptr)
     405             : {
     406      566792 :     return ptr < builder->start_decoding_at;
     407             : }
     408             : 
     409             : /*
     410             :  * Increase refcount of a snapshot.
     411             :  *
     412             :  * This is used when handing out a snapshot to some external resource or when
     413             :  * adding a Snapshot as builder->snapshot.
     414             :  */
     415             : static void
     416        5934 : SnapBuildSnapIncRefcount(Snapshot snap)
     417             : {
     418        5934 :     snap->active_count++;
     419        5934 : }
     420             : 
     421             : /*
     422             :  * Decrease refcount of a snapshot and free if the refcount reaches zero.
     423             :  *
     424             :  * Externally visible, so that external resources that have been handed an
     425             :  * IncRef'ed Snapshot can adjust its refcount easily.
     426             :  */
     427             : void
     428        5816 : SnapBuildSnapDecRefcount(Snapshot snap)
     429             : {
     430             :     /* make sure we don't get passed an external snapshot */
     431        5816 :     Assert(snap->snapshot_type == SNAPSHOT_HISTORIC_MVCC);
     432             : 
     433             :     /* make sure nobody modified our snapshot */
     434        5816 :     Assert(snap->curcid == FirstCommandId);
     435        5816 :     Assert(!snap->suboverflowed);
     436        5816 :     Assert(!snap->takenDuringRecovery);
     437             : 
     438        5816 :     Assert(snap->regd_count == 0);
     439             : 
     440        5816 :     Assert(snap->active_count > 0);
     441             : 
     442             :     /* slightly more likely, so it's checked even without casserts */
     443        5816 :     if (snap->copied)
     444           0 :         elog(ERROR, "cannot free a copied snapshot");
     445             : 
     446        5816 :     snap->active_count--;
     447        5816 :     if (snap->active_count == 0)
     448        1396 :         SnapBuildFreeSnapshot(snap);
     449        5816 : }
     450             : 
     451             : /*
     452             :  * Build a new snapshot, based on currently committed catalog-modifying
     453             :  * transactions.
     454             :  *
     455             :  * In-progress transactions with catalog access are *not* allowed to modify
     456             :  * these snapshots; they have to copy them and fill in appropriate ->curcid
     457             :  * and ->subxip/subxcnt values.
     458             :  */
     459             : static Snapshot
     460        1626 : SnapBuildBuildSnapshot(SnapBuild *builder)
     461             : {
     462             :     Snapshot    snapshot;
     463             :     Size        ssize;
     464             : 
     465        1626 :     Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT);
     466             : 
     467        1626 :     ssize = sizeof(SnapshotData)
     468        1626 :         + sizeof(TransactionId) * builder->committed.xcnt
     469        1626 :         + sizeof(TransactionId) * 1 /* toplevel xid */ ;
     470             : 
     471        1626 :     snapshot = MemoryContextAllocZero(builder->context, ssize);
     472             : 
     473        1626 :     snapshot->snapshot_type = SNAPSHOT_HISTORIC_MVCC;
     474             : 
     475             :     /*
     476             :      * We misuse the original meaning of SnapshotData's xip and subxip fields
     477             :      * to make them more fitting for our needs.
     478             :      *
     479             :      * In the 'xip' array we store transactions that have to be treated as
     480             :      * committed. Since we will only ever look at tuples from transactions
     481             :      * that have modified the catalog it's more efficient to store those few
     482             :      * that exist between xmin and xmax (frequently there are none).
     483             :      *
     484             :      * Snapshots that are used in transactions that have modified the catalog
     485             :      * also use the 'subxip' array to store their toplevel xid and all the
     486             :      * subtransaction xids so we can recognize when we need to treat rows as
     487             :      * visible that are not in xip but still need to be visible. Subxip only
     488             :      * gets filled when the snapshot is copied into the context of a
     489             :      * catalog modifying transaction since we otherwise share a snapshot
     490             :      * between transactions. As long as a txn hasn't modified the catalog it
     491             :      * doesn't need to treat any uncommitted rows as visible, so there is no
     492             :      * need for those xids.
     493             :      *
     494             :      * Both arrays are qsort'ed so that we can use bsearch() on them.
     495             :      */
     496        1626 :     Assert(TransactionIdIsNormal(builder->xmin));
     497        1626 :     Assert(TransactionIdIsNormal(builder->xmax));
     498             : 
     499        1626 :     snapshot->xmin = builder->xmin;
     500        1626 :     snapshot->xmax = builder->xmax;
     501             : 
     502             :     /* store all transactions to be treated as committed by this snapshot */
     503        1626 :     snapshot->xip =
     504        1626 :         (TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
     505        1626 :     snapshot->xcnt = builder->committed.xcnt;
     506        3252 :     memcpy(snapshot->xip,
     507        1626 :            builder->committed.xip,
     508        1626 :            builder->committed.xcnt * sizeof(TransactionId));
     509             : 
     510             :     /* sort so we can bsearch() */
     511        1626 :     qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator);
     512             : 
     513             :     /*
     514             :      * Initially, subxip is empty, i.e. it's a snapshot to be used by
     515             :      * transactions that don't modify the catalog. Will be filled by
     516             :      * ReorderBufferCopySnap() if necessary.
     517             :      */
     518        1626 :     snapshot->subxcnt = 0;
     519        1626 :     snapshot->subxip = NULL;
     520             : 
     521        1626 :     snapshot->suboverflowed = false;
     522        1626 :     snapshot->takenDuringRecovery = false;
     523        1626 :     snapshot->copied = false;
     524        1626 :     snapshot->curcid = FirstCommandId;
     525        1626 :     snapshot->active_count = 0;
     526        1626 :     snapshot->regd_count = 0;
     527        1626 :     snapshot->snapXactCompletionCount = 0;
     528             : 
     529        1626 :     return snapshot;
     530             : }
     531             : 
     532             : /*
     533             :  * Build the initial slot snapshot and convert it to a normal snapshot that
     534             :  * is understood by HeapTupleSatisfiesMVCC.  Raises ERROR if the builder is
     535             :  * not yet consistent, no longer tracks all transactions, or MyProc->xmin is
     536             :  * already set.  The snapshot will be usable directly in the current
     537             :  * transaction or exported for loading in a different transaction.
     538             :  */
     539             : Snapshot
     540         126 : SnapBuildInitialSnapshot(SnapBuild *builder)
     541             : {
     542             :     Snapshot    snap;
     543             :     TransactionId xid;
     544             :     TransactionId *newxip;
     545         126 :     int         newxcnt = 0;
     546             : 
     547         126 :     Assert(!FirstSnapshotSet);
     548         126 :     Assert(XactIsoLevel == XACT_REPEATABLE_READ);
     549             : 
     550         126 :     if (builder->state != SNAPBUILD_CONSISTENT)
     551           0 :         elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state");
     552             : 
     553         126 :     if (!builder->committed.includes_all_transactions)
     554           0 :         elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore");
     555             : 
     556             :     /* so we don't overwrite the existing value */
     557         126 :     if (TransactionIdIsValid(MyProc->xmin))
     558           0 :         elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid");
     559             : 
     560         126 :     snap = SnapBuildBuildSnapshot(builder);
     561             : 
     562             :     /*
     563             :      * We know that snap->xmin is alive, enforced by the logical xmin
     564             :      * mechanism. Due to that we can set MyProc->xmin below without locks,
     565             :      * we're only changing our own value.
     566             :      */
     567             : #ifdef USE_ASSERT_CHECKING
     568             :     {
     569             :         TransactionId safeXid;
     570             : 
     571         126 :         LWLockAcquire(ProcArrayLock, LW_SHARED);
     572         126 :         safeXid = GetOldestSafeDecodingTransactionId(false);
     573         126 :         LWLockRelease(ProcArrayLock);
     574             : 
     575         126 :         Assert(TransactionIdPrecedesOrEquals(safeXid, snap->xmin));
     576             :     }
     577             : #endif
     578             : 
     579         126 :     MyProc->xmin = snap->xmin;
     580             : 
     581             :     /* allocate in transaction context */
     582         126 :     newxip = (TransactionId *)
     583         126 :         palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount());
     584             : 
     585             :     /*
     586             :      * snapbuild.c builds snapshots in an "inverted" manner, which means it
     587             :      * stores committed transactions in ->xip, not ones in progress. Build a
     588             :      * classical snapshot by marking all non-committed transactions as
     589             :      * in-progress. This can be expensive: every xid in [xmin, xmax) is probed.
     590             :      */
     591         252 :     for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);)
     592             :     {
     593             :         void       *test;
     594             : 
     595             :         /*
     596             :          * Check whether transaction committed using the decoding snapshot
     597             :          * meaning of ->xip.
     598             :          */
     599           0 :         test = bsearch(&xid, snap->xip, snap->xcnt,
     600             :                        sizeof(TransactionId), xidComparator);
     601             : 
     602           0 :         if (test == NULL)
     603             :         {
     604           0 :             if (newxcnt >= GetMaxSnapshotXidCount())
     605           0 :                 ereport(ERROR,
     606             :                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
     607             :                          errmsg("initial slot snapshot too large")));
     608             : 
     609           0 :             newxip[newxcnt++] = xid;
     610             :         }
     611             : 
     612           0 :         TransactionIdAdvance(xid);
     613             :     }
     614             : 
     615             :     /* replace the inverted ->xip with the classical in-progress array */
     616         126 :     snap->snapshot_type = SNAPSHOT_MVCC;
     617         126 :     snap->xcnt = newxcnt;
     618         126 :     snap->xip = newxip;
     619             : 
     620         126 :     return snap;
     621             : }
     622             : 
     623             : /*
     624             :  * Export a snapshot so it can be set in another session with SET TRANSACTION
     625             :  * SNAPSHOT, returning the name produced by ExportSnapshot().
     626             :  *
     627             :  * For that we need to start a transaction in the current backend as the
     628             :  * importing side checks whether the source transaction is still open to make
     629             :  * sure the xmin horizon hasn't advanced since then.
     630             :  */
     631             : const char *
     632           0 : SnapBuildExportSnapshot(SnapBuild *builder)
     633             : {
     634             :     Snapshot    snap;
     635             :     char       *snapname;
     636             : 
     637           0 :     if (IsTransactionOrTransactionBlock())
     638           0 :         elog(ERROR, "cannot export a snapshot from within a transaction");
     639             : 
     640           0 :     if (SavedResourceOwnerDuringExport)
     641           0 :         elog(ERROR, "can only export one snapshot at a time");
     642             : 
     643           0 :     SavedResourceOwnerDuringExport = CurrentResourceOwner;
     644           0 :     ExportInProgress = true;
     645             : 
     646           0 :     StartTransactionCommand();
     647             : 
     648             :     /* There doesn't seem to be a nice API to set these */
     649           0 :     XactIsoLevel = XACT_REPEATABLE_READ;
     650           0 :     XactReadOnly = true;
     651             : 
     652           0 :     snap = SnapBuildInitialSnapshot(builder);
     653             : 
     654             :     /*
     655             :      * now that we've built a plain snapshot, hand it to the normal
     656             :      * mechanism (ExportSnapshot) for exporting it
     657             :      */
     658           0 :     snapname = ExportSnapshot(snap);
     659             : 
     660           0 :     ereport(LOG,
     661             :             (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID",
     662             :                            "exported logical decoding snapshot: \"%s\" with %u transaction IDs",
     663             :                            snap->xcnt,
     664             :                            snapname, snap->xcnt)));
     665           0 :     return snapname;
     666             : }
     667             : 
     668             : /*
     669             :  * Return builder->snapshot, building it first if unset; xid is currently unused.
     670             :  */
     671             : Snapshot
     672          56 : SnapBuildGetOrBuildSnapshot(SnapBuild *builder, TransactionId xid)
     673             : {
     674          56 :     Assert(builder->state == SNAPBUILD_CONSISTENT);
     675             : 
     676             :     /* only build a new snapshot if we don't have a prebuilt one */
     677          56 :     if (builder->snapshot == NULL)
     678             :     {
     679           0 :         builder->snapshot = SnapBuildBuildSnapshot(builder);
     680             :         /* increase refcount for the snapshot builder */
     681           0 :         SnapBuildSnapIncRefcount(builder->snapshot);
     682             :     }
     683             : 
     684          56 :     return builder->snapshot;
     685             : }
     686             : 
     687             : /*
     688             :  * Reset a previously SnapBuildExportSnapshot()'ed snapshot if there is
     689             :  * any. Aborts the previously started transaction and resets the resource
     690             :  * owner back to its original value.
     691             :  */
     692             : void
     693        1454 : SnapBuildClearExportedSnapshot(void)
     694             : {
     695             :     /* nothing exported, that is the usual case */
     696        1454 :     if (!ExportInProgress)
     697        2908 :         return;
     698             : 
     699           0 :     if (!IsTransactionState())
     700           0 :         elog(ERROR, "clearing exported snapshot in wrong transaction state");
     701             : 
     702             :     /* abort the export transaction, so nothing could have ever happened */
     703           0 :     AbortCurrentTransaction();
     704             : 
     705           0 :     CurrentResourceOwner = SavedResourceOwnerDuringExport;
     706           0 :     SavedResourceOwnerDuringExport = NULL;
     707           0 :     ExportInProgress = false;
     708             : }
     709             : 
     710             : /*
     711             :  * Handle the effects of a single heap change, appropriate to the current state
     712             :  * of the snapshot builder, and return whether changes made at (xid, lsn) can
     713             :  * be decoded.
     714             :  */
     715             : bool
     716     2992850 : SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn)
     717             : {
     718             :     /*
     719             :      * We can't handle data in transactions if we haven't built a snapshot
     720             :      * yet, so don't store them.
     721             :      */
     722     2992850 :     if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
     723           0 :         return false;
     724             : 
     725             :     /*
     726             :      * No point in keeping track of changes in transactions that we don't have
     727             :      * enough information about to decode. This means that they started before
     728             :      * we got into the SNAPBUILD_FULL_SNAPSHOT state.
     729             :      */
     730     2994836 :     if (builder->state < SNAPBUILD_CONSISTENT &&
     731        1986 :         TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder)))
     732        1986 :         return false;
     733             : 
     734             :     /*
     735             :      * If the reorderbuffer doesn't yet have a base snapshot for xid, add one
     736             :      * now, it will be needed to decode the change we're currently processing.
     737             :      */
     738     2990864 :     if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
     739             :     {
     740             :         /* only build a new snapshot if we don't have a prebuilt one */
     741        3226 :         if (builder->snapshot == NULL)
     742             :         {
     743         328 :             builder->snapshot = SnapBuildBuildSnapshot(builder);
     744             :             /* increase refcount for the snapshot builder */
     745         328 :             SnapBuildSnapIncRefcount(builder->snapshot);
     746             :         }
     747             : 
     748             :         /*
     749             :          * Increase refcount for the transaction we're handing the snapshot
     750             :          * out to.
     751             :          */
     752        3226 :         SnapBuildSnapIncRefcount(builder->snapshot);
     753        3226 :         ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
     754             :                                      builder->snapshot);
     755             :     }
     756             : 
     757     2990864 :     return true;
     758             : }
     759             : 
     760             : /*
     761             :  * Do CommandId/ComboCid handling after reading an xl_heap_new_cid record.
     762             :  * This implies that a transaction has done some form of write to system
     763             :  * catalogs.
     764             :  */
     765             : void
     766       30900 : SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid,
     767             :                        XLogRecPtr lsn, xl_heap_new_cid *xlrec)
     768             : {
     769             :     CommandId   cid;
     770             : 
     771             :     /*
     772             :      * we only log new_cid's if a catalog tuple was modified, so mark the
     773             :      * transaction as containing catalog modifications
     774             :      */
     775       30900 :     ReorderBufferXidSetCatalogChanges(builder->reorder, xid, lsn);
     776             : 
     777       30900 :     ReorderBufferAddNewTupleCids(builder->reorder, xlrec->top_xid, lsn,
     778             :                                  xlrec->target_node, xlrec->target_tid,
     779             :                                  xlrec->cmin, xlrec->cmax,
     780             :                                  xlrec->combocid);
     781             : 
     782             :     /* figure out new command id: the highest valid one of cmin and cmax */
     783       59452 :     if (xlrec->cmin != InvalidCommandId &&
     784       28552 :         xlrec->cmax != InvalidCommandId)
     785        5776 :         cid = Max(xlrec->cmin, xlrec->cmax);
     786       25124 :     else if (xlrec->cmax != InvalidCommandId)
     787        2348 :         cid = xlrec->cmax;
     788       22776 :     else if (xlrec->cmin != InvalidCommandId)
     789       22776 :         cid = xlrec->cmin;
     790             :     else
     791             :     {
     792           0 :         cid = InvalidCommandId; /* silence compiler */
     793           0 :         elog(ERROR, "xl_heap_new_cid record without a valid CommandId");
     794             :     }
     795             : 
     796       30900 :     ReorderBufferAddNewCommandId(builder->reorder, xid, lsn, cid + 1);
     797       30900 : }
     798             : 
     799             : /*
     800             :  * Add the builder's current snapshot, at lsn, to all transactions we're
     801             :  * decoding that currently are in-progress so they can see new catalog
     802             :  * contents made by the transaction that just committed. This is necessary
     803             :  * because those in-progress transactions will use the new catalog's contents
     804             :  * from here on (at the very least everything they do needs to be compatible
     805             :  * with newer catalog contents).
     806             :  */
     807             : static void
     808        1164 : SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn)
     809             : {
     810             :     dlist_iter  txn_i;
     811             :     ReorderBufferTXN *txn;
     812             : 
     813             :     /*
     814             :      * Iterate through all toplevel transactions. This can include
     815             :      * subtransactions which we just don't yet know to be that, but that's
     816             :      * fine, they will just get an unnecessary snapshot queued.
     817             :      */
     818        2372 :     dlist_foreach(txn_i, &builder->reorder->toplevel_by_lsn)
     819             :     {
     820        1208 :         txn = dlist_container(ReorderBufferTXN, node, txn_i.cur);
     821             : 
     822        1208 :         Assert(TransactionIdIsValid(txn->xid));
     823             : 
     824             :         /*
     825             :          * If we don't have a base snapshot yet, there are no changes in this
     826             :          * transaction which in turn implies we don't yet need a snapshot at
     827             :          * all. We'll add a snapshot when the first change gets queued.
     828             :          *
     829             :          * NB: This works correctly even for subtransactions because
     830             :          * ReorderBufferAssignChild() takes care to transfer the base snapshot
     831             :          * to the top-level transaction, and while iterating the changequeue
     832             :          * we'll get the change from the subtxn.
     833             :          */
     834        1208 :         if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, txn->xid))
     835           4 :             continue;
     836             : 
     837        1204 :         elog(DEBUG2, "adding a new snapshot to %u at %X/%X",
     838             :              txn->xid, (uint32) (lsn >> 32), (uint32) lsn);
     839             : 
     840             :         /*
     841             :          * increase the snapshot's refcount for the transaction we are handing
     842             :          * it out to
     843             :          */
     844        1204 :         SnapBuildSnapIncRefcount(builder->snapshot);
     845        1204 :         ReorderBufferAddSnapshot(builder->reorder, txn->xid, lsn,
     846             :                                  builder->snapshot);
     847             :     }
     848        1164 : }
     849             : 
     850             : /*
     851             :  * Append xid to the builder's committed array, growing the array if it is full.
     852             :  */
     853             : static void
     854        1176 : SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid)
     855             : {
     856        1176 :     Assert(TransactionIdIsValid(xid));
     857             : 
     858        1176 :     if (builder->committed.xcnt == builder->committed.xcnt_space)
     859             :     {
     860           4 :         builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1;
     861             : 
     862           4 :         elog(DEBUG1, "increasing space for committed transactions to %u",
     863             :              (uint32) builder->committed.xcnt_space);
     864             : 
     865           4 :         builder->committed.xip = repalloc(builder->committed.xip,
     866           4 :                                           builder->committed.xcnt_space * sizeof(TransactionId));
     867             :     }
     868             : 
     869             :     /*
     870             :      * TODO: It might make sense to keep the array sorted here instead of
     871             :      * doing it every time we build a new snapshot. On the other hand this
     872             :      * gets called repeatedly when a transaction with subtransactions commits.
     873             :      */
     874        1176 :     builder->committed.xip[builder->committed.xcnt++] = xid;
     875        1176 : }
     876             : 
     877             : /*
     878             :  * Remove knowledge about transactions we treat as committed that are smaller
     879             :  * than ->xmin. Those won't ever get checked via the ->committed array but via
     880             :  * the clog machinery, so we don't need to waste memory on them.
     881             :  */
     882             : static void
     883         186 : SnapBuildPurgeCommittedTxn(SnapBuild *builder)
     884             : {
     885             :     int         off;
     886             :     TransactionId *workspace;
     887         186 :     int         surviving_xids = 0;
     888             : 
     889             :     /* not ready yet, ->xmin isn't a normal xid */
     890         186 :     if (!TransactionIdIsNormal(builder->xmin))
     891         186 :         return;
     892             : 
     893             :     /* TODO: Neater algorithm than just copying and iterating? */
     894         186 :     workspace =
     895         186 :         MemoryContextAlloc(builder->context,
     896         186 :                            builder->committed.xcnt * sizeof(TransactionId));
     897             : 
     898             :     /* copy xids that do not precede ->xmin to workspace, in original order */
     899         262 :     for (off = 0; off < builder->committed.xcnt; off++)
     900             :     {
     901          76 :         if (NormalTransactionIdPrecedes(builder->committed.xip[off],
     902             :                                         builder->xmin))
     903             :             ;                   /* remove */
     904             :         else
     905           4 :             workspace[surviving_xids++] = builder->committed.xip[off];
     906             :     }
     907             : 
     908             :     /* copy workspace back to persistent state */
     909         186 :     memcpy(builder->committed.xip, workspace,
     910             :            surviving_xids * sizeof(TransactionId));
     911             : 
     912         186 :     elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
     913             :          (uint32) builder->committed.xcnt, (uint32) surviving_xids,
     914             :          builder->xmin, builder->xmax);
     915         186 :     builder->committed.xcnt = surviving_xids;
     916             : 
     917         186 :     pfree(workspace);
     918             : }
     919             : 
     920             : /*
     921             :  * On commit: track the xids, bump ->xmax, hand out a new snapshot, as needed.
     922             :  */
     923             : void
     924        3138 : SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
     925             :                    int nsubxacts, TransactionId *subxacts)
     926             : {
     927             :     int         nxact;
     928             : 
     929        3138 :     bool        needs_snapshot = false;
     930        3138 :     bool        needs_timetravel = false;
     931        3138 :     bool        sub_needs_timetravel = false;
     932             : 
     933        3138 :     TransactionId xmax = xid;
     934             : 
     935             :     /*
     936             :      * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor
     937             :      * will they be part of a snapshot.  So we don't need to record anything.
     938             :      */
     939        6276 :     if (builder->state == SNAPBUILD_START ||
     940        3138 :         (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
     941           0 :          TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder))))
     942             :     {
     943             :         /* ensure that only commits after this are getting replayed */
     944           0 :         if (builder->start_decoding_at <= lsn)
     945           0 :             builder->start_decoding_at = lsn + 1;
     946           0 :         return;
     947             :     }
     948             : 
     949        3138 :     if (builder->state < SNAPBUILD_CONSISTENT)
     950             :     {
     951             :         /* ensure that only commits after this are getting replayed */
     952           6 :         if (builder->start_decoding_at <= lsn)
     953           4 :             builder->start_decoding_at = lsn + 1;
     954             : 
     955             :         /*
     956             :          * If building an exportable snapshot, force xid to be tracked, even
     957             :          * if the transaction didn't modify the catalog.
     958             :          */
     959           6 :         if (builder->building_full_snapshot)
     960             :         {
     961           2 :             needs_timetravel = true;
     962             :         }
     963             :     }
     964             : 
     965        5588 :     for (nxact = 0; nxact < nsubxacts; nxact++)
     966             :     {
     967        2450 :         TransactionId subxid = subxacts[nxact];
     968             : 
     969             :         /*
     970             :          * Add subtransaction to base snapshot if catalog modifying, we don't
     971             :          * distinguish to toplevel transactions there.
     972             :          */
     973        2450 :         if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid))
     974             :         {
     975          10 :             sub_needs_timetravel = true;
     976          10 :             needs_snapshot = true;
     977             : 
     978          10 :             elog(DEBUG1, "found subtransaction %u:%u with catalog changes",
     979             :                  xid, subxid);
     980             : 
     981          10 :             SnapBuildAddCommittedTxn(builder, subxid);
     982             : 
     983          10 :             if (NormalTransactionIdFollows(subxid, xmax))
     984          10 :                 xmax = subxid;
     985             :         }
     986             : 
     987             :         /*
     988             :          * If we're forcing timetravel we also need visibility information
     989             :          * about subtransaction, so keep track of subtransaction's state, even
     990             :          * if not catalog modifying.  Don't need to distribute a snapshot in
     991             :          * that case.
     992             :          */
     993        2440 :         else if (needs_timetravel)
     994             :         {
     995           0 :             SnapBuildAddCommittedTxn(builder, subxid);
     996           0 :             if (NormalTransactionIdFollows(subxid, xmax))
     997           0 :                 xmax = subxid;
     998             :         }
     999             :     }
    1000             : 
    1001             :     /* if top-level modified catalog, it'll need a snapshot */
    1002        3138 :     if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid))
    1003             :     {
    1004        1164 :         elog(DEBUG2, "found top level transaction %u, with catalog changes",
    1005             :              xid);
    1006        1164 :         needs_snapshot = true;
    1007        1164 :         needs_timetravel = true;
    1008        1164 :         SnapBuildAddCommittedTxn(builder, xid);
    1009             :     }
    1010        1974 :     else if (sub_needs_timetravel)
    1011             :     {
    1012             :         /* track toplevel txn too; a snapshot is needed, so force timetravel */
                     :         needs_timetravel = true;
    1013           0 :         SnapBuildAddCommittedTxn(builder, xid);
    1014             :     }
    1015        1974 :     else if (needs_timetravel)
    1016             :     {
    1017           2 :         elog(DEBUG2, "forced transaction %u to do timetravel", xid);
    1018             : 
    1019           2 :         SnapBuildAddCommittedTxn(builder, xid);
    1020             :     }
    1021             : 
    1022        3138 :     if (!needs_timetravel)
    1023             :     {
    1024             :         /* record that we cannot export a general snapshot anymore */
    1025        1972 :         builder->committed.includes_all_transactions = false;
    1026             :     }
    1027             : 
    1028        3138 :     Assert(!needs_snapshot || needs_timetravel);
    1029             : 
    1030             :     /*
    1031             :      * Adjust xmax of the snapshot builder, we only do that for committed,
    1032             :      * catalog modifying, transactions, everything else isn't interesting for
    1033             :      * us since we'll never look at the respective rows.
    1034             :      */
    1035        4304 :     if (needs_timetravel &&
    1036        2332 :         (!TransactionIdIsValid(builder->xmax) ||
    1037        1166 :          TransactionIdFollowsOrEquals(xmax, builder->xmax)))
    1038             :     {
    1039        1164 :         builder->xmax = xmax;
    1040        1164 :         TransactionIdAdvance(builder->xmax);
    1041             :     }
    1042             : 
    1043             :     /* if there's any reason to build a historic snapshot, do so now */
    1044        3138 :     if (needs_snapshot)
    1045             :     {
    1046             :         /*
    1047             :          * If we haven't built a complete snapshot yet there's no need to hand
    1048             :          * it out, it wouldn't (and couldn't) be used anyway.
    1049             :          */
    1050        1164 :         if (builder->state < SNAPBUILD_FULL_SNAPSHOT)
    1051           0 :             return;
    1052             : 
    1053             :         /*
    1054             :          * Decrease the snapshot builder's refcount of the old snapshot, note
    1055             :          * that it still will be used if it has been handed out to the
    1056             :          * reorderbuffer earlier.
    1057             :          */
    1058        1164 :         if (builder->snapshot)
    1059        1164 :             SnapBuildSnapDecRefcount(builder->snapshot);
    1060             : 
    1061        1164 :         builder->snapshot = SnapBuildBuildSnapshot(builder);
    1062             : 
    1063             :         /* we might need to execute invalidations, add snapshot */
    1064        1164 :         if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid))
    1065             :         {
    1066           4 :             SnapBuildSnapIncRefcount(builder->snapshot);
    1067           4 :             ReorderBufferSetBaseSnapshot(builder->reorder, xid, lsn,
    1068             :                                          builder->snapshot);
    1069             :         }
    1070             : 
    1071             :         /* refcount of the snapshot builder for the new snapshot */
    1072        1164 :         SnapBuildSnapIncRefcount(builder->snapshot);
    1073             : 
    1074             :         /* add a new catalog snapshot to all currently running transactions */
    1075        1164 :         SnapBuildDistributeNewCatalogSnapshot(builder, lsn);
    1076             :     }
    1077             : }
    1078             : 
    1079             : 
    1080             : /* -----------------------------------
    1081             :  * Snapshot building functions dealing with xlog records
    1082             :  * -----------------------------------
    1083             :  */
    1084             : 
    1085             : /*
    1086             :  * Process a running xacts record, and use its information to first build a
    1087             :  * historic snapshot and later to release resources that aren't needed
    1088             :  * anymore.
    1089             :  */
    1090             : void
    1091         982 : SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
    1092             : {
    1093             :     ReorderBufferTXN *txn;
    1094             :     TransactionId xmin;
    1095             : 
    1096             :     /*
    1097             :      * If we're not consistent yet, inspect the record to see whether it
    1098             :      * allows us to get closer to being consistent. If we are consistent, dump
    1099             :      * our snapshot so others or we, after a restart, can use it.
    1100             :      */
    1101         982 :     if (builder->state < SNAPBUILD_CONSISTENT)
    1102             :     {
    1103             :         /* returns false if there's no point in performing cleanup just yet */
    1104         820 :         if (!SnapBuildFindSnapshot(builder, lsn, running))
    1105         796 :             return;
    1106             :     }
    1107             :     else
    1108         162 :         SnapBuildSerialize(builder, lsn);
    1109             : 
    1110             :     /*
    1111             :      * Update range of interesting xids based on the running xacts
    1112             :      * information. We don't increase ->xmax using it, because once we are in
    1113             :      * a consistent state we can do that ourselves and much more efficiently
    1114             :      * so, because we only need to do it for catalog transactions since we
    1115             :      * only ever look at those.
    1116             :      *
    1117             :      * NB: We only increase xmax when a catalog modifying transaction commits
    1118             :      * (see SnapBuildCommitTxn).  Because of this, xmax can be lower than
    1119             :      * xmin, which looks odd but is correct and actually more efficient, since
    1120             :      * we hit fast paths in heapam_visibility.c.
    1121             :      */
    1122         186 :     builder->xmin = running->oldestRunningXid;
    1123             : 
    1124             :     /* Remove transactions we don't need to keep track of anymore */
    1125         186 :     SnapBuildPurgeCommittedTxn(builder);
    1126             : 
    1127             :     /*
    1128             :      * Advance the xmin limit for the current replication slot, to allow
    1129             :      * vacuum to clean up the tuples this slot has been protecting.
    1130             :      *
    1131             :      * The reorderbuffer might have an xmin among the currently running
    1132             :      * snapshots; use it if so.  If not, we need only consider the snapshots
    1133             :      * we'll produce later, which can't be less than the oldest running xid in
    1134             :      * the record we're reading now.
    1135             :      */
    1136         186 :     xmin = ReorderBufferGetOldestXmin(builder->reorder);
    1137         186 :     if (xmin == InvalidTransactionId)
    1138         168 :         xmin = running->oldestRunningXid;
    1139         186 :     elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
    1140             :          builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
    1141         186 :     LogicalIncreaseXminForSlot(lsn, xmin);
    1142             : 
    1143             :     /*
    1144             :      * Also tell the slot where we can restart decoding from. We don't want to
    1145             :      * do that after every commit because changing that implies an fsync of
    1146             :      * the logical slot's state file, so we only do it every time we see a
    1147             :      * running xacts record.
    1148             :      *
    1149             :      * Do so by looking for the oldest in progress transaction (determined by
    1150             :      * the first LSN of any of its relevant records). Every transaction
    1151             :      * remembers the last location we stored the snapshot to disk before its
    1152             :      * beginning. That point is where we can restart from.
    1153             :      */
    1154             : 
    1155             :     /*
    1156             :      * Can't know about a serialized snapshot's location if we're not
    1157             :      * consistent.
    1158             :      */
    1159         186 :     if (builder->state < SNAPBUILD_CONSISTENT)
    1160          20 :         return;
    1161             : 
    1162         166 :     txn = ReorderBufferGetOldestTXN(builder->reorder);
    1163             : 
    1164             :     /*
    1165             :      * The oldest ongoing txn might have started before we serialized
    1166             :      * anything, because we hadn't reached a consistent state yet.
    1167             :      */
    1168         166 :     if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr)
    1169          10 :         LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn);
    1170             : 
    1171             :     /*
    1172             :      * No in-progress transaction, can reuse the last serialized snapshot if
    1173             :      * we have one.
    1174             :      */
    1175         304 :     else if (txn == NULL &&
    1176         292 :              builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr &&
    1177         144 :              builder->last_serialized_snapshot != InvalidXLogRecPtr)
    1178         144 :         LogicalIncreaseRestartDecodingForSlot(lsn,
    1179             :                                               builder->last_serialized_snapshot);
    1180             : }
    1181             : 
    1182             : 
    1183             : /*
    1184             :  * Build the start of a snapshot that's capable of decoding the catalog.
    1185             :  *
    1186             :  * Helper function for SnapBuildProcessRunningXacts() while we're not yet
    1187             :  * consistent.
    1188             :  *
    1189             :  * Returns true if there is a point in performing internal maintenance/cleanup
    1190             :  * using the xl_running_xacts record.
    1191             :  */
    1192             : static bool
    1193         820 : SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running)
    1194             : {
    1195             :     /* ---
    1196             :      * Build catalog decoding snapshot incrementally using information about
    1197             :      * the currently running transactions. There are several ways to do that:
    1198             :      *
    1199             :      * a) There were no running transactions when the xl_running_xacts record
    1200             :      *    was inserted, jump to CONSISTENT immediately. We might find such a
    1201             :      *    state while waiting on c)'s sub-states.
    1202             :      *
    1203             :      * b) This (in a previous run) or another decoding slot serialized a
    1204             :      *    snapshot to disk that we can use.  Can't use this method for the
    1205             :      *    initial snapshot when slot is being created and needs full snapshot
    1206             :      *    for export or direct use, as that snapshot will only contain catalog
    1207             :      *    modifying transactions.
    1208             :      *
    1209             :      * c) First incrementally build a snapshot for catalog tuples
    1210             :      *    (BUILDING_SNAPSHOT), that requires all, already in-progress,
    1211             :      *    transactions to finish.  Every transaction starting after that
    1212             :      *    (FULL_SNAPSHOT state), has enough information to be decoded.  But
    1213             :      *    for older running transactions no viable snapshot exists yet, so
    1214             :      *    CONSISTENT will only be reached once all of those have finished.
    1215             :      * ---
    1216             :      */
    1217             : 
    1218             :     /*
    1219             :      * xl_running_xacts record is older than what we can use, we might not have
    1220             :      * all necessary catalog rows anymore.
    1221             :      */
    1222        1148 :     if (TransactionIdIsNormal(builder->initial_xmin_horizon) &&
    1223         328 :         NormalTransactionIdPrecedes(running->oldestRunningXid,
    1224             :                                     builder->initial_xmin_horizon))
    1225             :     {
    1226           0 :         ereport(DEBUG1,
    1227             :                 (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low",
    1228             :                                  (uint32) (lsn >> 32), (uint32) lsn),
    1229             :                  errdetail_internal("initial xmin horizon of %u vs the snapshot's %u",
    1230             :                                     builder->initial_xmin_horizon, running->oldestRunningXid)));
    1231             : 
    1232             : 
    1233           0 :         SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon);
    1234             : 
    1235           0 :         return true;
    1236             :     }
    1237             : 
    1238             :     /*
    1239             :      * a) No transactions were running, we can jump to consistent.
    1240             :      *
    1241             :      * This is not affected by races around xl_running_xacts, because we can
    1242             :      * miss transaction commits, but currently not transactions starting.
    1243             :      *
    1244             :      * NB: We might have already started to incrementally assemble a snapshot,
    1245             :      * so we need to be careful to deal with that.
    1246             :      */
    1247         820 :     if (running->oldestRunningXid == running->nextXid)
    1248             :     {
    1249        1264 :         if (builder->start_decoding_at == InvalidXLogRecPtr ||
    1250         476 :             builder->start_decoding_at <= lsn)
    1251             :             /* can decode everything after this */
    1252         314 :             builder->start_decoding_at = lsn + 1;
    1253             : 
    1254             :         /* As no transactions were running xmin/xmax can be trivially set. */
    1255         788 :         builder->xmin = running->nextXid; /* < are finished */
    1256         788 :         builder->xmax = running->nextXid; /* >= are running */
    1257             : 
    1258             :         /* so we can safely use the faster comparisons */
    1259         788 :         Assert(TransactionIdIsNormal(builder->xmin));
    1260         788 :         Assert(TransactionIdIsNormal(builder->xmax));
    1261             : 
    1262         788 :         builder->state = SNAPBUILD_CONSISTENT;
    1263         788 :         SnapBuildStartNextPhaseAt(builder, InvalidTransactionId);
    1264             : 
    1265         788 :         ereport(LOG,
    1266             :                 (errmsg("logical decoding found consistent point at %X/%X",
    1267             :                         (uint32) (lsn >> 32), (uint32) lsn),
    1268             :                  errdetail("There are no running transactions.")));
    1269             : 
    1270         788 :         return false;
    1271             :     }
    1272             :     /* b) valid on disk state and not building full snapshot */
    1273          58 :     else if (!builder->building_full_snapshot &&
    1274          26 :              SnapBuildRestore(builder, lsn))
    1275             :     {
    1276             :         /* there won't be any state to cleanup */
    1277           8 :         return false;
    1278             :     }
    1279             : 
    1280             :     /*
    1281             :      * c) transition from START to BUILDING_SNAPSHOT.
    1282             :      *
    1283             :      * In START state, and an xl_running_xacts record with running xacts is
    1284             :      * encountered.  In that case, switch to BUILDING_SNAPSHOT state, and
    1285             :      * record xl_running_xacts->nextXid.  Once all running xacts have finished
    1286             :      * (i.e. they're all >= nextXid), we have a complete catalog snapshot.  It
    1287             :      * might look like we could use xl_running_xacts' ->xids information to
    1288             :      * get there quicker, but that is problematic because transactions marked
    1289             :      * as running, might already have inserted their commit record - it's
    1290             :      * infeasible to change that with locking.
    1291             :      */
    1292          24 :     else if (builder->state == SNAPBUILD_START)
    1293             :     {
    1294          14 :         builder->state = SNAPBUILD_BUILDING_SNAPSHOT;
    1295          14 :         SnapBuildStartNextPhaseAt(builder, running->nextXid);
    1296             : 
    1297             :         /*
    1298             :          * Start with an xmin/xmax that's correct for future, when all the
    1299             :          * currently running transactions have finished. We'll update both
    1300             :          * while waiting for the pending transactions to finish.
    1301             :          */
    1302          14 :         builder->xmin = running->nextXid; /* < are finished */
    1303          14 :         builder->xmax = running->nextXid; /* >= are running */
    1304             : 
    1305             :         /* so we can safely use the faster comparisons */
    1306          14 :         Assert(TransactionIdIsNormal(builder->xmin));
    1307          14 :         Assert(TransactionIdIsNormal(builder->xmax));
    1308             : 
    1309          14 :         ereport(LOG,
    1310             :                 (errmsg("logical decoding found initial starting point at %X/%X",
    1311             :                         (uint32) (lsn >> 32), (uint32) lsn),
    1312             :                  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
    1313             :                            running->xcnt, running->nextXid)));
    1314             : 
    1315          14 :         SnapBuildWaitSnapshot(running, running->nextXid);
    1316             :     }
    1317             : 
    1318             :     /*
    1319             :      * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT.
    1320             :      *
    1321             :      * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid
    1322             :      * is >= nextXid from when we switched to BUILDING_SNAPSHOT.  This
    1323             :      * means all transactions starting afterwards have enough information to
    1324             :      * be decoded.  Switch to FULL_SNAPSHOT.
    1325             :      */
    1326          16 :     else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT &&
    1327           6 :              TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder),
    1328             :                                            running->oldestRunningXid))
    1329             :     {
    1330           6 :         builder->state = SNAPBUILD_FULL_SNAPSHOT;
    1331           6 :         SnapBuildStartNextPhaseAt(builder, running->nextXid);
    1332             : 
    1333           6 :         ereport(LOG,
    1334             :                 (errmsg("logical decoding found initial consistent point at %X/%X",
    1335             :                         (uint32) (lsn >> 32), (uint32) lsn),
    1336             :                  errdetail("Waiting for transactions (approximately %d) older than %u to end.",
    1337             :                            running->xcnt, running->nextXid)));
    1338             : 
    1339           6 :         SnapBuildWaitSnapshot(running, running->nextXid);
    1340             :     }
    1341             : 
    1342             :     /*
    1343             :      * c) transition from FULL_SNAPSHOT to CONSISTENT.
    1344             :      *
    1345             :      * In FULL_SNAPSHOT state (see the previous transition), and this
    1346             :      * xl_running_xacts' oldestRunningXid is >= nextXid from when we switched to
    1347             :      * FULL_SNAPSHOT.  This means all transactions that are currently in
    1348             :      * progress have a catalog snapshot, and all their changes have been
    1349             :      * collected.  Switch to CONSISTENT.
    1350             :      */
    1351           8 :     else if (builder->state == SNAPBUILD_FULL_SNAPSHOT &&
    1352           4 :              TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder),
    1353             :                                            running->oldestRunningXid))
    1354             :     {
    1355           4 :         builder->state = SNAPBUILD_CONSISTENT;
    1356           4 :         SnapBuildStartNextPhaseAt(builder, InvalidTransactionId);
    1357             : 
    1358           4 :         ereport(LOG,
    1359             :                 (errmsg("logical decoding found consistent point at %X/%X",
    1360             :                         (uint32) (lsn >> 32), (uint32) lsn),
    1361             :                  errdetail("There are no old transactions anymore.")));
    1362             :     }
    1363             : 
    1364             :     /*
    1365             :      * We already started to track running xacts and need to wait for all
    1366             :      * in-progress ones to finish. We fall through to the normal processing of
    1367             :      * records so incremental cleanup can be performed.
    1368             :      */
    1369          24 :     return true;
    1370             : 
    1371             : }
    1372             : 
    1373             : /* ---
    1374             :  * Iterate through xids in record, wait for all older than the cutoff to
    1375             :  * finish.  Then, if possible, log a new xl_running_xacts record.
    1376             :  *
    1377             :  * This isn't required for the correctness of decoding, but to:
    1378             :  * a) allow isolationtester to notice that we're currently waiting for
    1379             :  *    something.
    1380             :  * b) log a new xl_running_xacts record where it'd be helpful, without having
    1381             :  *    to wait for bgwriter or checkpointer.
    1382             :  * ---
    1383             :  */
    1384             : static void
    1385          20 : SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff)
    1386             : {
    1387             :     int         off;
    1388             : 
    1389          40 :     for (off = 0; off < running->xcnt; off++)
    1390             :     {
    1391          20 :         TransactionId xid = running->xids[off];
    1392             : 
    1393             :         /*
    1394             :          * Upper layers should prevent that we ever need to wait on ourselves.
    1395             :          * Check anyway, since failing to do so would either result in an
    1396             :          * endless wait or an Assert() failure.
    1397             :          */
    1398          20 :         if (TransactionIdIsCurrentTransactionId(xid))
    1399           0 :             elog(ERROR, "waiting for ourselves");
    1400             : 
    1401          20 :         if (TransactionIdFollows(xid, cutoff))
    1402           0 :             continue;
    1403             : 
    1404          20 :         XactLockTableWait(xid, NULL, NULL, XLTW_None);
    1405             :     }
    1406             : 
    1407             :     /*
    1408             :      * All transactions we needed to finish finished - try to ensure there is
    1409             :      * another xl_running_xacts record in a timely manner, without having to
    1410             :      * wait for bgwriter or checkpointer to log one.  During recovery we
    1411             :      * can't enforce that, so we'll have to wait.
    1412             :      */
    1413          20 :     if (!RecoveryInProgress())
    1414             :     {
    1415          20 :         LogStandbySnapshot();
    1416             :     }
    1417          20 : }
    1418             : 
    1419             : /* -----------------------------------
    1420             :  * Snapshot serialization support
    1421             :  * -----------------------------------
    1422             :  */
    1423             : 
    1424             : /*
    1425             :  * We store current state of struct SnapBuild on disk in the following manner:
    1426             :  *
    1427             :  * struct SnapBuildOnDisk;
    1428             :  * TransactionId * running.xcnt_space;
    1429             :  * TransactionId * committed.xcnt; (*not xcnt_space*)
    1430             :  *
    1431             :  */
    1432             : typedef struct SnapBuildOnDisk
    1433             : {
    1434             :     /* first part of this struct needs to be version independent */
    1435             : 
    1436             :     /* data not covered by checksum */
    1437             :     uint32      magic;
    1438             :     pg_crc32c   checksum;
    1439             : 
    1440             :     /* data covered by checksum */
    1441             : 
    1442             :     /* version, in case we want to support pg_upgrade */
    1443             :     uint32      version;
    1444             :     /* how large is the on disk data, excluding the constant sized part; NOTE(review): SnapBuildSerialize() stores the whole allocation size here, including sizeof(SnapBuildOnDisk) -- confirm intended meaning */
    1445             :     uint32      length;
    1446             : 
    1447             :     /* version dependent part */
    1448             :     SnapBuild   builder;
    1449             : 
    1450             :     /* variable amount of TransactionIds follows */
    1451             : } SnapBuildOnDisk;
    1452             : 
    1453             : #define SnapBuildOnDiskConstantSize \
    1454             :     offsetof(SnapBuildOnDisk, builder)
    1455             : #define SnapBuildOnDiskNotChecksummedSize \
    1456             :     offsetof(SnapBuildOnDisk, version)
    1457             : 
    1458             : #define SNAPBUILD_MAGIC 0x51A1E001
    1459             : #define SNAPBUILD_VERSION 2
    1460             : 
    1461             : /*
    1462             :  * Store a snapshot to disk if the builder is consistent, else try to load one.
    1463             :  *
    1464             :  * Supposed to be used by external (i.e. not snapbuild.c) code that just read
    1465             :  * a record that's a potential location for a serialized snapshot.
    1466             :  */
    1467             : void
    1468          30 : SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn)
    1469             : {
    1470          30 :     if (builder->state < SNAPBUILD_CONSISTENT)
    1471           0 :         SnapBuildRestore(builder, lsn);
    1472             :     else
    1473          30 :         SnapBuildSerialize(builder, lsn);
    1474          30 : }
    1475             : 
    1476             : /*
    1477             :  * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already
    1478             :  * been done by another decoding process.
    1479             :  */
    1480             : static void
    1481         192 : SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn)
    1482             : {
    1483             :     Size        needed_length;
    1484             :     SnapBuildOnDisk *ondisk;
    1485             :     char       *ondisk_c;
    1486             :     int         fd;
    1487             :     char        tmppath[MAXPGPATH];
    1488             :     char        path[MAXPGPATH];
    1489             :     int         ret;
    1490             :     struct stat stat_buf;
    1491             :     Size        sz;
    1492             : 
    1493         192 :     Assert(lsn != InvalidXLogRecPtr);
    1494         192 :     Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr ||
    1495             :            builder->last_serialized_snapshot <= lsn);
    1496             : 
    1497             :     /*
    1498             :      * no point in serializing if we cannot continue to work immediately after
    1499             :      * restoring the snapshot
    1500             :      */
    1501         192 :     if (builder->state < SNAPBUILD_CONSISTENT)
    1502         192 :         return;
    1503             : 
    1504             :     /*
    1505             :      * We identify snapshots by the LSN they are valid for. We don't need to
    1506             :      * include timelines in the name as each LSN maps to exactly one timeline
    1507             :      * unless the user used pg_resetwal or similar. If a user did so, there's
    1508             :      * no hope continuing to decode anyway.
    1509             :      */
    1510         384 :     sprintf(path, "pg_logical/snapshots/%X-%X.snap",
    1511         192 :             (uint32) (lsn >> 32), (uint32) lsn);
    1512             : 
    1513             :     /*
    1514             :      * first check whether some other backend already has written the snapshot
    1515             :      * for this LSN. It's perfectly fine if there's none, so we accept ENOENT
    1516             :      * as a valid state. Everything else is an unexpected error.
    1517             :      */
    1518         192 :     ret = stat(path, &stat_buf);
    1519             : 
    1520         192 :     if (ret != 0 && errno != ENOENT)
    1521           0 :         ereport(ERROR,
    1522             :                 (errcode_for_file_access(),
    1523             :                  errmsg("could not stat file \"%s\": %m", path)));
    1524             : 
    1525         192 :     else if (ret == 0)
    1526             :     {
    1527             :         /*
    1528             :          * somebody else has already serialized to this point, don't overwrite
    1529             :          * but remember location, so we don't need to read old data again.
    1530             :          *
    1531             :          * To be sure it has been synced to disk after the rename() from the
    1532             :          * tempfile filename to the real filename, we just repeat the fsync.
    1533             :          * That ought to be cheap because in most scenarios it should already
    1534             :          * be safely on disk.
    1535             :          */
    1536          44 :         fsync_fname(path, false);
    1537          44 :         fsync_fname("pg_logical/snapshots", true);
    1538             : 
    1539          44 :         builder->last_serialized_snapshot = lsn;
    1540          44 :         goto out;
    1541             :     }
    1542             : 
    1543             :     /*
    1544             :      * there is an obvious race condition here between the time we stat(2) the
    1545             :      * file and us writing the file. But we rename the file into place
    1546             :      * atomically and all files created need to contain the same data anyway,
    1547             :      * so this is perfectly fine, although a bit of a resource waste. Locking
    1548             :      * seems like pointless complication.
    1549             :      */
    1550         148 :     elog(DEBUG1, "serializing snapshot to %s", path);
    1551             : 
    1552             :     /* to make sure only we will write to this tempfile, include pid */
    1553         296 :     sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%u.tmp",
    1554         148 :             (uint32) (lsn >> 32), (uint32) lsn, MyProcPid);
    1555             : 
    1556             :     /*
    1557             :      * Unlink the temporary file if it already exists. It must be left over
    1558             :      * from before a crash/error, since we won't enter this function twice from
    1559             :      * within a single decoding slot/backend and the temporary file contains
    1560             :      * the pid of the current process.
    1561             :      */
    1562         148 :     if (unlink(tmppath) != 0 && errno != ENOENT)
    1563           0 :         ereport(ERROR,
    1564             :                 (errcode_for_file_access(),
    1565             :                  errmsg("could not remove file \"%s\": %m", tmppath)));
    1566             : 
    1567         148 :     needed_length = sizeof(SnapBuildOnDisk) +
    1568         148 :         sizeof(TransactionId) * builder->committed.xcnt;
    1569             : 
    1570         148 :     ondisk_c = MemoryContextAllocZero(builder->context, needed_length);
    1571         148 :     ondisk = (SnapBuildOnDisk *) ondisk_c;
    1572         148 :     ondisk->magic = SNAPBUILD_MAGIC;
    1573         148 :     ondisk->version = SNAPBUILD_VERSION;
    1574         148 :     ondisk->length = needed_length;
    1575         148 :     INIT_CRC32C(ondisk->checksum);
    1576         148 :     COMP_CRC32C(ondisk->checksum,
    1577             :                 ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize,
    1578             :                 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
    1579         148 :     ondisk_c += sizeof(SnapBuildOnDisk);
    1580             : 
    1581         148 :     memcpy(&ondisk->builder, builder, sizeof(SnapBuild));
    1582             :     /* NULL-ify memory-only data */
    1583         148 :     ondisk->builder.context = NULL;
    1584         148 :     ondisk->builder.snapshot = NULL;
    1585         148 :     ondisk->builder.reorder = NULL;
    1586         148 :     ondisk->builder.committed.xip = NULL;
    1587             : 
    1588         148 :     COMP_CRC32C(ondisk->checksum,
    1589             :                 &ondisk->builder,
    1590             :                 sizeof(SnapBuild));
    1591             : 
    1592             :     /* there shouldn't be any running xacts */
    1593         148 :     Assert(builder->was_running.was_xcnt == 0);
    1594             : 
    1595             :     /* copy committed xacts */
    1596         148 :     sz = sizeof(TransactionId) * builder->committed.xcnt;
    1597         148 :     memcpy(ondisk_c, builder->committed.xip, sz);
    1598         148 :     COMP_CRC32C(ondisk->checksum, ondisk_c, sz);
    1599         148 :     ondisk_c += sz;
    1600             : 
    1601         148 :     FIN_CRC32C(ondisk->checksum);
    1602             : 
    1603             :     /* we have valid data now, open tempfile and write it there */
    1604         148 :     fd = OpenTransientFile(tmppath,
    1605             :                            O_CREAT | O_EXCL | O_WRONLY | PG_BINARY);
    1606         148 :     if (fd < 0)
    1607           0 :         ereport(ERROR,
    1608             :                 (errcode_for_file_access(),
    1609             :                  errmsg("could not open file \"%s\": %m", tmppath)));
    1610             : 
    1611         148 :     errno = 0;
    1612         148 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE);
    1613         148 :     if ((write(fd, ondisk, needed_length)) != needed_length)
    1614             :     {
    1615           0 :         int         save_errno = errno;
    1616             : 
    1617           0 :         CloseTransientFile(fd);
    1618             : 
    1619             :         /* if write didn't set errno, assume problem is no disk space */
    1620           0 :         errno = save_errno ? save_errno : ENOSPC;
    1621           0 :         ereport(ERROR,
    1622             :                 (errcode_for_file_access(),
    1623             :                  errmsg("could not write to file \"%s\": %m", tmppath)));
    1624             :     }
    1625         148 :     pgstat_report_wait_end();
    1626             : 
    1627             :     /*
    1628             :      * fsync the file before renaming so that even if we crash after this we
    1629             :      * have either a fully valid file or nothing.
    1630             :      *
    1631             :      * It's safe to just ERROR on fsync() here because we'll retry the whole
    1632             :      * operation including the writes.
    1633             :      *
    1634             :      * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has
    1635             :      * some noticeable overhead since it's performed synchronously during
    1636             :      * decoding?
    1637             :      */
    1638         148 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC);
    1639         148 :     if (pg_fsync(fd) != 0)
    1640             :     {
    1641           0 :         int         save_errno = errno;
    1642             : 
    1643           0 :         CloseTransientFile(fd);
    1644           0 :         errno = save_errno;
    1645           0 :         ereport(ERROR,
    1646             :                 (errcode_for_file_access(),
    1647             :                  errmsg("could not fsync file \"%s\": %m", tmppath)));
    1648             :     }
    1649         148 :     pgstat_report_wait_end();
    1650             : 
    1651         148 :     if (CloseTransientFile(fd) != 0)
    1652           0 :         ereport(ERROR,
    1653             :                 (errcode_for_file_access(),
    1654             :                  errmsg("could not close file \"%s\": %m", tmppath)));
    1655             : 
    1656         148 :     fsync_fname("pg_logical/snapshots", true);
    1657             : 
    1658             :     /*
    1659             :      * We may overwrite the work from some other backend, but that's ok, our
    1660             :      * snapshot is valid as well, we'll just have done some superfluous work.
    1661             :      */
    1662         148 :     if (rename(tmppath, path) != 0)
    1663             :     {
    1664           0 :         ereport(ERROR,
    1665             :                 (errcode_for_file_access(),
    1666             :                  errmsg("could not rename file \"%s\" to \"%s\": %m",
    1667             :                         tmppath, path)));
    1668             :     }
    1669             : 
    1670             :     /* make sure we persist */
    1671         148 :     fsync_fname(path, false);
    1672         148 :     fsync_fname("pg_logical/snapshots", true);
    1673             : 
    1674             :     /*
    1675             :      * Now there's no way we can lose the dumped state anymore, remember this
    1676             :      * as a serialization point.
    1677             :      */
    1678         148 :     builder->last_serialized_snapshot = lsn;
    1679             : 
    1680             : out:
    1681         192 :     ReorderBufferSetRestartPoint(builder->reorder,
    1682             :                                  builder->last_serialized_snapshot);
    1683             : }
    1684             : 
    1685             : /*
    1686             :  * Restore into 'builder' the snapshot previously stored for location 'lsn', if
    1687             :  * any. Returns true if it was loaded; false if missing, unneeded or unusable.
    1688             :  */
    1689             : static bool
    1690          26 : SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn)
    1691             : {
    1692             :     SnapBuildOnDisk ondisk;
    1693             :     int         fd;
    1694             :     char        path[MAXPGPATH];
    1695             :     Size        sz;
    1696             :     int         readBytes;
    1697             :     pg_crc32c   checksum;
    1698             : 
    1699             :     /* no point in loading a snapshot if we're already consistent */
    1700          26 :     if (builder->state == SNAPBUILD_CONSISTENT)
    1701           0 :         return false;
    1702             : 
    1703          52 :     sprintf(path, "pg_logical/snapshots/%X-%X.snap",
    1704          26 :             (uint32) (lsn >> 32), (uint32) lsn);
    1705             : 
    1706          26 :     fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
    1707             : 
    1708          26 :     if (fd < 0 && errno == ENOENT)
    1709          18 :         return false;
    1710           8 :     else if (fd < 0)
    1711           0 :         ereport(ERROR,
    1712             :                 (errcode_for_file_access(),
    1713             :                  errmsg("could not open file \"%s\": %m", path)));
    1714             : 
    1715             :     /* ----
    1716             :      * Make sure the snapshot had been stored safely to disk, that's normally
    1717             :      * cheap.
    1718             :      * Note that we do not need PANIC here, nobody will be able to use the
    1719             :      * slot without fsyncing, and saving it won't succeed without an fsync()
    1720             :      * either...
    1721             :      * ----
    1722             :      */
    1723           8 :     fsync_fname(path, false);
    1724           8 :     fsync_fname("pg_logical/snapshots", true);
    1725             : 
    1726             : 
    1727             :     /* read statically sized portion of snapshot */
    1728           8 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
    1729           8 :     readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize);
    1730           8 :     pgstat_report_wait_end();
    1731           8 :     if (readBytes != SnapBuildOnDiskConstantSize)
    1732             :     {
    1733           0 :         int         save_errno = errno;
    1734             : 
    1735           0 :         CloseTransientFile(fd);
    1736             : 
    1737           0 :         if (readBytes < 0)
    1738             :         {
    1739           0 :             errno = save_errno;
    1740           0 :             ereport(ERROR,
    1741             :                     (errcode_for_file_access(),
    1742             :                      errmsg("could not read file \"%s\": %m", path)));
    1743             :         }
    1744             :         else
    1745           0 :             ereport(ERROR,
    1746             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    1747             :                      errmsg("could not read file \"%s\": read %d of %zu",
    1748             :                             path, readBytes,
    1749             :                             (Size) SnapBuildOnDiskConstantSize)));
    1750             :     }
    1751             : 
    1752           8 :     if (ondisk.magic != SNAPBUILD_MAGIC)
    1753           0 :         ereport(ERROR,
    1754             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    1755             :                  errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u",
    1756             :                         path, ondisk.magic, SNAPBUILD_MAGIC)));
    1757             : 
    1758           8 :     if (ondisk.version != SNAPBUILD_VERSION)
    1759           0 :         ereport(ERROR,
    1760             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    1761             :                  errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u",
    1762             :                         path, ondisk.version, SNAPBUILD_VERSION)));
    1763             : 
    1764           8 :     INIT_CRC32C(checksum);
    1765           8 :     COMP_CRC32C(checksum,
    1766             :                 ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize,
    1767             :                 SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize);
    1768             : 
    1769             :     /* read SnapBuild */
    1770           8 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
    1771           8 :     readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild));
    1772           8 :     pgstat_report_wait_end();
    1773           8 :     if (readBytes != sizeof(SnapBuild))
    1774             :     {
    1775           0 :         int         save_errno = errno;
    1776             : 
    1777           0 :         CloseTransientFile(fd);
    1778             : 
    1779           0 :         if (readBytes < 0)
    1780             :         {
    1781           0 :             errno = save_errno;
    1782           0 :             ereport(ERROR,
    1783             :                     (errcode_for_file_access(),
    1784             :                      errmsg("could not read file \"%s\": %m", path)));
    1785             :         }
    1786             :         else
    1787           0 :             ereport(ERROR,
    1788             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    1789             :                      errmsg("could not read file \"%s\": read %d of %zu",
    1790             :                             path, readBytes, sizeof(SnapBuild))));
    1791             :     }
    1792           8 :     COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild));
    1793             : 
    1794             :     /* restore running xacts (dead, but kept for backward compat) */
    1795           8 :     sz = sizeof(TransactionId) * ondisk.builder.was_running.was_xcnt_space;
    1796           8 :     ondisk.builder.was_running.was_xip =
    1797           8 :         MemoryContextAllocZero(builder->context, sz);
    1798           8 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
    1799           8 :     readBytes = read(fd, ondisk.builder.was_running.was_xip, sz);
    1800           8 :     pgstat_report_wait_end();
    1801           8 :     if (readBytes != sz)
    1802             :     {
    1803           0 :         int         save_errno = errno;
    1804             : 
    1805           0 :         CloseTransientFile(fd);
    1806             : 
    1807           0 :         if (readBytes < 0)
    1808             :         {
    1809           0 :             errno = save_errno;
    1810           0 :             ereport(ERROR,
    1811             :                     (errcode_for_file_access(),
    1812             :                      errmsg("could not read file \"%s\": %m", path)));
    1813             :         }
    1814             :         else
    1815           0 :             ereport(ERROR,
    1816             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    1817             :                      errmsg("could not read file \"%s\": read %d of %zu",
    1818             :                             path, readBytes, sz)));
    1819             :     }
    1820           8 :     COMP_CRC32C(checksum, ondisk.builder.was_running.was_xip, sz);
    1821             : 
    1822             :     /* restore committed xacts information */
    1823           8 :     sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt;
    1824           8 :     ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz);
    1825           8 :     pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ);
    1826           8 :     readBytes = read(fd, ondisk.builder.committed.xip, sz);
    1827           8 :     pgstat_report_wait_end();
    1828           8 :     if (readBytes != sz)
    1829             :     {
    1830           0 :         int         save_errno = errno;
    1831             : 
    1832           0 :         CloseTransientFile(fd);
    1833             : 
    1834           0 :         if (readBytes < 0)
    1835             :         {
    1836           0 :             errno = save_errno;
    1837           0 :             ereport(ERROR,
    1838             :                     (errcode_for_file_access(),
    1839             :                      errmsg("could not read file \"%s\": %m", path)));
    1840             :         }
    1841             :         else
    1842           0 :             ereport(ERROR,
    1843             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    1844             :                      errmsg("could not read file \"%s\": read %d of %zu",
    1845             :                             path, readBytes, sz)));
    1846             :     }
    1847           8 :     COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz);
    1848             : 
    1849           8 :     if (CloseTransientFile(fd) != 0)
    1850           0 :         ereport(ERROR,
    1851             :                 (errcode_for_file_access(),
    1852             :                  errmsg("could not close file \"%s\": %m", path)));
    1853             : 
    1854           8 :     FIN_CRC32C(checksum);
    1855             : 
    1856             :     /* verify checksum of what we've read */
    1857           8 :     if (!EQ_CRC32C(checksum, ondisk.checksum))
    1858           0 :         ereport(ERROR,
    1859             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    1860             :                  errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u",
    1861             :                         path, checksum, ondisk.checksum)));
    1862             : 
    1863             :     /*
    1864             :      * ok, we now have a sensible snapshot here, figure out if it has more
    1865             :      * information than we have.
    1866             :      */
    1867             : 
    1868             :     /*
    1869             :      * We are only interested in consistent snapshots for now, comparing
    1870             :      * whether one incomplete snapshot is more "advanced" seems to be
    1871             :      * unnecessarily complex.
    1872             :      */
    1873           8 :     if (ondisk.builder.state < SNAPBUILD_CONSISTENT)
    1874           0 :         goto snapshot_not_interesting;
    1875             : 
    1876             :     /*
    1877             :      * Don't use a snapshot that requires an xmin that we cannot guarantee to
    1878             :      * be available.
    1879             :      */
    1880           8 :     if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon))
    1881           0 :         goto snapshot_not_interesting;
    1882             : 
    1883             : 
    1884             :     /* ok, we think the snapshot is sensible, copy over everything important */
    1885           8 :     builder->xmin = ondisk.builder.xmin;
    1886           8 :     builder->xmax = ondisk.builder.xmax;
    1887           8 :     builder->state = ondisk.builder.state;
    1888             : 
    1889           8 :     builder->committed.xcnt = ondisk.builder.committed.xcnt;
    1890             :     /* we only allocated/stored xcnt xids, not xcnt_space of them */
    1891             :     /* keep the preallocated xip array if the stored snapshot has no xids */
    1892           8 :     if (builder->committed.xcnt > 0)
    1893             :     {
    1894           8 :         pfree(builder->committed.xip);
    1895           8 :         builder->committed.xcnt_space = ondisk.builder.committed.xcnt;
    1896           8 :         builder->committed.xip = ondisk.builder.committed.xip;
    1897             :     }
    1898           8 :     ondisk.builder.committed.xip = NULL;
    1899             : 
    1900             :     /* our snapshot is not interesting anymore, build a new one */
    1901           8 :     if (builder->snapshot != NULL)
    1902             :     {
    1903           0 :         SnapBuildSnapDecRefcount(builder->snapshot);
    1904             :     }
    1905           8 :     builder->snapshot = SnapBuildBuildSnapshot(builder);
    1906           8 :     SnapBuildSnapIncRefcount(builder->snapshot);
    1907             : 
    1908           8 :     ReorderBufferSetRestartPoint(builder->reorder, lsn);
    1909             : 
    1910           8 :     Assert(builder->state == SNAPBUILD_CONSISTENT);
    1911             : 
    1912           8 :     ereport(LOG,
    1913             :             (errmsg("logical decoding found consistent point at %X/%X",
    1914             :                     (uint32) (lsn >> 32), (uint32) lsn),
    1915             :              errdetail("Logical decoding will begin using saved snapshot.")));
    1916           8 :     return true;
    1917             : 
    1918             : snapshot_not_interesting:
    1919           0 :     if (ondisk.builder.committed.xip != NULL)
    1920           0 :         pfree(ondisk.builder.committed.xip);
    1921           0 :     return false;
    1922             : }
    1923             : 
    1924             : /*
    1925             :  * Remove all serialized snapshots that are not required anymore because no
    1926             :  * slot can need them. This doesn't actually have to run during a checkpoint,
    1927             :  * but it's a convenient point to schedule this.
    1928             :  *
    1929             :  * NB: We run this during checkpoints even if logical decoding is disabled so
    1930             :  * we clean up old snapshot files at some point after it got disabled.
    1931             :  */
    1932             : void
    1933         702 : CheckPointSnapBuild(void)
    1934             : {
    1935             :     XLogRecPtr  cutoff;
    1936             :     XLogRecPtr  redo;
    1937             :     DIR        *snap_dir;
    1938             :     struct dirent *snap_de;
    1939             :     char        path[MAXPGPATH + 21];
    1940             : 
    1941             :     /*
    1942             :      * We start off with a minimum of the last redo pointer. No new
    1943             :      * replication slot will start before that, so that's a safe upper bound
    1944             :      * for removal.
    1945             :      */
    1946         702 :     redo = GetRedoRecPtr();
    1947             : 
    1948             :     /* now check for the restart ptrs from existing slots */
    1949         702 :     cutoff = ReplicationSlotsComputeLogicalRestartLSN();
    1950             : 
    1951             :     /* use the smaller of the slots' restart LSN and the redo pointer */
    1952         702 :     if (redo < cutoff)
    1953           0 :         cutoff = redo;
    1954             : 
    1955         702 :     snap_dir = AllocateDir("pg_logical/snapshots");
    1956        2962 :     while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
    1957             :     {
    1958             :         uint32      hi;
    1959             :         uint32      lo;
    1960             :         XLogRecPtr  lsn;
    1961             :         struct stat statbuf;
    1962             : 
    1963        2414 :         if (strcmp(snap_de->d_name, ".") == 0 ||
    1964         856 :             strcmp(snap_de->d_name, "..") == 0)
    1965        2808 :             continue;
    1966             : 
    1967         154 :         snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
    1968             : 
    1969         154 :         if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
    1970             :         {
    1971           0 :             elog(DEBUG1, "only regular files expected: %s", path);
    1972           0 :             continue;
    1973             :         }
    1974             : 
    1975             :         /*
    1976             :          * temporary filenames from SnapBuildSerialize() include the LSN and
    1977             :          * everything but are postfixed by .$pid.tmp. We can just remove them
    1978             :          * the same as other files because there can be none that are
    1979             :          * currently being written that are older than cutoff.
    1980             :          *
    1981             :          * We just log a message if a file doesn't fit the pattern, it's
    1982             :          * probably some editor's lock/state file or similar...
    1983             :          */
    1984         154 :         if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
    1985             :         {
    1986           0 :             ereport(LOG,
    1987             :                     (errmsg("could not parse file name \"%s\"", path)));
    1988           0 :             continue;
    1989             :         }
    1990             : 
    1991         154 :         lsn = ((uint64) hi) << 32 | lo;
    1992             : 
    1993             :         /* remove it if it's older than the cutoff, or if the cutoff is invalid */
    1994         154 :         if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
    1995             :         {
    1996         104 :             elog(DEBUG1, "removing snapbuild snapshot %s", path);
    1997             : 
    1998             :             /*
    1999             :              * It's not particularly harmful, though strange, if we can't
    2000             :              * remove the file here. Don't prevent the checkpoint from
    2001             :              * completing, that'd be a cure worse than the disease.
    2002             :              */
    2003         104 :             if (unlink(path) < 0)
    2004             :             {
    2005           0 :                 ereport(LOG,
    2006             :                         (errcode_for_file_access(),
    2007             :                          errmsg("could not remove file \"%s\": %m",
    2008             :                                 path)));
    2009           0 :                 continue;
    2010             :             }
    2011             :         }
    2012             :     }
    2013         702 :     FreeDir(snap_dir);
    2014         702 : }

Generated by: LCOV version 1.14