Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * reorderbuffer.c
4 : * PostgreSQL logical replay/reorder buffer management
5 : *
6 : *
7 : * Copyright (c) 2012-2020, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/replication/reorderbuffer.c
12 : *
13 : * NOTES
14 : * This module gets handed individual pieces of transactions in the order
15 : * they are written to the WAL and is responsible to reassemble them into
16 : * toplevel transaction sized pieces. When a transaction is completely
17 : * reassembled - signaled by reading the transaction commit record - it
18 : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : * individual changes. The output plugins rely on snapshots built by
20 : * snapbuild.c which hands them to us.
21 : *
22 : * Transactions and subtransactions/savepoints in postgres are not
23 : * immediately linked to each other from outside the performing
24 : * backend. Only at commit/abort (or special xact_assignment records) are
25 : * they linked together, which means that we have to splice together a
26 : * toplevel transaction from its subtransactions. To do that efficiently we
27 : * build a binary heap indexed by the smallest current lsn of the individual
28 : * subtransactions' changestreams. As the individual streams are inherently
29 : * ordered by LSN - since that is where we build them from - the transaction
30 : * can easily be reassembled by always using the subtransaction with the
31 : * smallest current LSN from the heap.
32 : *
33 : * In order to cope with large transactions - which can be several times as
34 : * big as the available memory - this module supports spooling the contents
35 : * of large transactions to disk. When such a transaction is replayed the
36 : * contents of individual (sub-)transactions will be read from disk in
37 : * chunks.
38 : *
39 : * This module also has to deal with reassembling toast records from the
40 : * individual chunks stored in WAL. When a new (or initial) version of a
41 : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : * emitted for the columns stored out of line. Within a single toplevel
43 : * transaction there will be no other data carrying records between a row's
44 : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : * details.
46 : *
47 : * ReorderBuffer uses two special memory context types - SlabContext for
48 : * allocations of fixed-length structures (changes and transactions), and
49 : * GenerationContext for the variable-length transaction data (allocated
50 : * and freed in groups with similar lifespans).
51 : *
52 : * To limit the amount of memory used by decoded changes, we track memory
53 : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : * each transaction. When the total amount of used memory exceeds the
55 : * limit, the transaction consuming the most memory is then serialized to
56 : * disk.
57 : *
58 : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : * transaction records. The number of toplevel transactions is limited,
60 : * but a transaction with many subtransactions may still consume significant
61 : * amounts of memory. However, the transaction records are fairly small and
62 : * are not included in the memory limit.
63 : *
64 : * The current eviction algorithm is very simple - the transaction is
65 : * picked merely by size, while it might be useful to also consider age
66 : * (LSN) of the changes for example. With the new Generational memory
67 : * allocator, evicting the oldest changes would make it more likely the
68 : * memory gets actually freed.
69 : *
70 : * We still rely on max_changes_in_memory when loading serialized changes
71 : * back into memory. At that point we can't use the memory limit directly
72 : * as we load the subxacts independently. One option to deal with this
73 : * would be to count the subxacts, and allow each to allocate 1/N of the
74 : * memory limit. That however does not seem very appealing, because with
75 : * many subtransactions it may easily cause thrashing (short cycles of
76 : * deserializing and applying very few changes). We probably should give
77 : * a bit more memory to the oldest subtransactions, because it's likely
78 : * they are the source for the next sequence of changes.
79 : *
80 : * -------------------------------------------------------------------------
81 : */
82 : #include "postgres.h"
83 :
84 : #include <unistd.h>
85 : #include <sys/stat.h>
86 :
87 : #include "access/detoast.h"
88 : #include "access/heapam.h"
89 : #include "access/rewriteheap.h"
90 : #include "access/transam.h"
91 : #include "access/xact.h"
92 : #include "access/xlog_internal.h"
93 : #include "catalog/catalog.h"
94 : #include "lib/binaryheap.h"
95 : #include "miscadmin.h"
96 : #include "pgstat.h"
97 : #include "replication/logical.h"
98 : #include "replication/reorderbuffer.h"
99 : #include "replication/slot.h"
100 : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
101 : #include "storage/bufmgr.h"
102 : #include "storage/fd.h"
103 : #include "storage/sinval.h"
104 : #include "utils/builtins.h"
105 : #include "utils/combocid.h"
106 : #include "utils/memdebug.h"
107 : #include "utils/memutils.h"
108 : #include "utils/rel.h"
109 : #include "utils/relfilenodemap.h"
110 :
111 :
112 : /* entry for a hash table we use to map from xid to our transaction state */
113 : typedef struct ReorderBufferTXNByIdEnt
114 : {
115 : TransactionId xid;
116 : ReorderBufferTXN *txn;
117 : } ReorderBufferTXNByIdEnt;
118 :
119 : /* data structures for (relfilenode, ctid) => (cmin, cmax) mapping */
120 : typedef struct ReorderBufferTupleCidKey
121 : {
122 : RelFileNode relnode;
123 : ItemPointerData tid;
124 : } ReorderBufferTupleCidKey;
125 :
126 : typedef struct ReorderBufferTupleCidEnt
127 : {
128 : ReorderBufferTupleCidKey key;
129 : CommandId cmin;
130 : CommandId cmax;
131 : CommandId combocid; /* just for debugging */
132 : } ReorderBufferTupleCidEnt;
133 :
134 : /* Virtual file descriptor with file offset tracking */
135 : typedef struct TXNEntryFile
136 : {
137 : File vfd; /* -1 when the file is closed */
138 : off_t curOffset; /* offset for next write or read. Reset to 0
139 : * when vfd is opened. */
140 : } TXNEntryFile;
141 :
142 : /* k-way in-order change iteration support structures */
143 : typedef struct ReorderBufferIterTXNEntry
144 : {
145 : XLogRecPtr lsn;
146 : ReorderBufferChange *change;
147 : ReorderBufferTXN *txn;
148 : TXNEntryFile file;
149 : XLogSegNo segno;
150 : } ReorderBufferIterTXNEntry;
151 :
152 : typedef struct ReorderBufferIterTXNState
153 : {
154 : binaryheap *heap;
155 : Size nr_txns;
156 : dlist_head old_change;
157 : ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
158 : } ReorderBufferIterTXNState;
159 :
160 : /* toast datastructures */
161 : typedef struct ReorderBufferToastEnt
162 : {
163 : Oid chunk_id; /* toast_table.chunk_id */
164 : int32 last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
165 : * have seen */
166 : Size num_chunks; /* number of chunks we've already seen */
167 : Size size; /* combined size of chunks seen */
168 : dlist_head chunks; /* linked list of chunks */
169 : struct varlena *reconstructed; /* reconstructed varlena now pointed to in
170 : * main tup */
171 : } ReorderBufferToastEnt;
172 :
173 : /* Disk serialization support datastructures */
174 : typedef struct ReorderBufferDiskChange
175 : {
176 : Size size;
177 : ReorderBufferChange change;
178 : /* data follows */
179 : } ReorderBufferDiskChange;
180 :
/*
 * Predicates classifying a change's action.  Note that each macro may
 * evaluate its argument more than once.
 */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)

#define IsSpecConfirm(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM)

/* true for plain inserts, updates, and speculative inserts */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 IsSpecInsert(action))
195 :
196 : /*
197 : * Maximum number of changes kept in memory, per transaction. After that,
198 : * changes are spooled to disk.
199 : *
200 : * The current value should be sufficient to decode the entire transaction
201 : * without hitting disk in OLTP workloads, while starting to spool to disk in
202 : * other workloads reasonably fast.
203 : *
204 : * At some point in the future it probably makes sense to have a more elaborate
205 : * resource management here, but it's not entirely clear what that would look
206 : * like.
207 : */
208 : int logical_decoding_work_mem;
209 : static const Size max_changes_in_memory = 4096; /* XXX for restore only */
210 :
211 : /* ---------------------------------------
212 : * primary reorderbuffer support routines
213 : * ---------------------------------------
214 : */
215 : static ReorderBufferTXN *ReorderBufferGetTXN(ReorderBuffer *rb);
216 : static void ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
217 : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
218 : TransactionId xid, bool create, bool *is_new,
219 : XLogRecPtr lsn, bool create_as_top);
220 : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
221 : ReorderBufferTXN *subtxn);
222 :
223 : static void AssertTXNLsnOrder(ReorderBuffer *rb);
224 :
225 : /* ---------------------------------------
226 : * support functions for lsn-order iterating over the ->changes of a
227 : * transaction and its subtransactions
228 : *
229 : * used for iteration over the k-way heap merge of a transaction and its
230 : * subtransactions
231 : * ---------------------------------------
232 : */
233 : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
234 : ReorderBufferIterTXNState *volatile *iter_state);
235 : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
236 : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
237 : ReorderBufferIterTXNState *state);
238 : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
239 :
240 : /*
241 : * ---------------------------------------
242 : * Disk serialization support functions
243 : * ---------------------------------------
244 : */
245 : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
246 : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
247 : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
248 : int fd, ReorderBufferChange *change);
249 : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
250 : TXNEntryFile *file, XLogSegNo *segno);
251 : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
252 : char *change);
253 : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
254 : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
255 : bool txn_prepared);
256 : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
257 : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
258 : TransactionId xid, XLogSegNo segno);
259 :
260 : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
261 : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
262 : ReorderBufferTXN *txn, CommandId cid);
263 :
264 : /*
265 : * ---------------------------------------
266 : * Streaming support functions
267 : * ---------------------------------------
268 : */
269 : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
270 : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
271 : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
272 : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
273 :
274 : /* ---------------------------------------
275 : * toast reassembly support
276 : * ---------------------------------------
277 : */
278 : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
279 : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
280 : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
281 : Relation relation, ReorderBufferChange *change);
282 : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
283 : Relation relation, ReorderBufferChange *change);
284 :
285 : /*
286 : * ---------------------------------------
287 : * memory accounting
288 : * ---------------------------------------
289 : */
290 : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
291 : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
292 : ReorderBufferChange *change, bool addition);
293 :
294 : /*
295 : * Allocate a new ReorderBuffer and clean out any old serialized state from
296 : * prior ReorderBuffer instances for the same slot.
297 : */
298 : ReorderBuffer *
299 820 : ReorderBufferAllocate(void)
300 : {
301 : ReorderBuffer *buffer;
302 : HASHCTL hash_ctl;
303 : MemoryContext new_ctx;
304 :
305 820 : Assert(MyReplicationSlot != NULL);
306 :
307 : /* allocate memory in own context, to have better accountability */
308 820 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
309 : "ReorderBuffer",
310 : ALLOCSET_DEFAULT_SIZES);
311 :
312 820 : buffer =
313 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
314 :
315 820 : memset(&hash_ctl, 0, sizeof(hash_ctl));
316 :
317 820 : buffer->context = new_ctx;
318 :
319 820 : buffer->change_context = SlabContextCreate(new_ctx,
320 : "Change",
321 : SLAB_DEFAULT_BLOCK_SIZE,
322 : sizeof(ReorderBufferChange));
323 :
324 820 : buffer->txn_context = SlabContextCreate(new_ctx,
325 : "TXN",
326 : SLAB_DEFAULT_BLOCK_SIZE,
327 : sizeof(ReorderBufferTXN));
328 :
329 820 : buffer->tup_context = GenerationContextCreate(new_ctx,
330 : "Tuples",
331 : SLAB_LARGE_BLOCK_SIZE);
332 :
333 820 : hash_ctl.keysize = sizeof(TransactionId);
334 820 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
335 820 : hash_ctl.hcxt = buffer->context;
336 :
337 820 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
338 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
339 :
340 820 : buffer->by_txn_last_xid = InvalidTransactionId;
341 820 : buffer->by_txn_last_txn = NULL;
342 :
343 820 : buffer->outbuf = NULL;
344 820 : buffer->outbufsize = 0;
345 820 : buffer->size = 0;
346 :
347 820 : buffer->spillTxns = 0;
348 820 : buffer->spillCount = 0;
349 820 : buffer->spillBytes = 0;
350 :
351 820 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
352 :
353 820 : dlist_init(&buffer->toplevel_by_lsn);
354 820 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
355 :
356 : /*
357 : * Ensure there's no stale data from prior uses of this slot, in case some
358 : * prior exit avoided calling ReorderBufferFree. Failure to do this can
359 : * produce duplicated txns, and it's very cheap if there's nothing there.
360 : */
361 820 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
362 :
363 820 : return buffer;
364 : }
365 :
366 : /*
367 : * Free a ReorderBuffer
368 : */
369 : void
370 738 : ReorderBufferFree(ReorderBuffer *rb)
371 : {
372 738 : MemoryContext context = rb->context;
373 :
374 : /*
375 : * We free separately allocated data by entirely scrapping reorderbuffer's
376 : * memory context.
377 : */
378 738 : MemoryContextDelete(context);
379 :
380 : /* Free disk space used by unconsumed reorder buffers */
381 738 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
382 738 : }
383 :
384 : /*
385 : * Get an unused, possibly preallocated, ReorderBufferTXN.
386 : */
387 : static ReorderBufferTXN *
388 4520 : ReorderBufferGetTXN(ReorderBuffer *rb)
389 : {
390 : ReorderBufferTXN *txn;
391 :
392 4520 : txn = (ReorderBufferTXN *)
393 4520 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
394 :
395 4520 : memset(txn, 0, sizeof(ReorderBufferTXN));
396 :
397 4520 : dlist_init(&txn->changes);
398 4520 : dlist_init(&txn->tuplecids);
399 4520 : dlist_init(&txn->subtxns);
400 :
401 : /* InvalidCommandId is not zero, so set it explicitly */
402 4520 : txn->command_id = InvalidCommandId;
403 :
404 4520 : return txn;
405 : }
406 :
407 : /*
408 : * Free a ReorderBufferTXN.
409 : */
410 : static void
411 4468 : ReorderBufferReturnTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
412 : {
413 : /* clean the lookup cache if we were cached (quite likely) */
414 4468 : if (rb->by_txn_last_xid == txn->xid)
415 : {
416 4118 : rb->by_txn_last_xid = InvalidTransactionId;
417 4118 : rb->by_txn_last_txn = NULL;
418 : }
419 :
420 : /* free data that's contained */
421 :
422 4468 : if (txn->gid != NULL)
423 : {
424 52 : pfree(txn->gid);
425 52 : txn->gid = NULL;
426 : }
427 :
428 4468 : if (txn->tuplecid_hash != NULL)
429 : {
430 402 : hash_destroy(txn->tuplecid_hash);
431 402 : txn->tuplecid_hash = NULL;
432 : }
433 :
434 4468 : if (txn->invalidations)
435 : {
436 1162 : pfree(txn->invalidations);
437 1162 : txn->invalidations = NULL;
438 : }
439 :
440 4468 : pfree(txn);
441 4468 : }
442 :
443 : /*
444 : * Get an fresh ReorderBufferChange.
445 : */
446 : ReorderBufferChange *
447 3364572 : ReorderBufferGetChange(ReorderBuffer *rb)
448 : {
449 : ReorderBufferChange *change;
450 :
451 3364572 : change = (ReorderBufferChange *)
452 3364572 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
453 :
454 3364572 : memset(change, 0, sizeof(ReorderBufferChange));
455 3364572 : return change;
456 : }
457 :
458 : /*
459 : * Free a ReorderBufferChange and update memory accounting, if requested.
460 : */
461 : void
462 3361556 : ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change,
463 : bool upd_mem)
464 : {
465 : /* update memory accounting info */
466 3361556 : if (upd_mem)
467 3343002 : ReorderBufferChangeMemoryUpdate(rb, change, false);
468 :
469 : /* free contained data */
470 3361556 : switch (change->action)
471 : {
472 : case REORDER_BUFFER_CHANGE_INSERT:
473 : case REORDER_BUFFER_CHANGE_UPDATE:
474 : case REORDER_BUFFER_CHANGE_DELETE:
475 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
476 3253282 : if (change->data.tp.newtuple)
477 : {
478 2838754 : ReorderBufferReturnTupleBuf(rb, change->data.tp.newtuple);
479 2838754 : change->data.tp.newtuple = NULL;
480 : }
481 :
482 3253282 : if (change->data.tp.oldtuple)
483 : {
484 275986 : ReorderBufferReturnTupleBuf(rb, change->data.tp.oldtuple);
485 275986 : change->data.tp.oldtuple = NULL;
486 : }
487 3253282 : break;
488 : case REORDER_BUFFER_CHANGE_MESSAGE:
489 52 : if (change->data.msg.prefix != NULL)
490 52 : pfree(change->data.msg.prefix);
491 52 : change->data.msg.prefix = NULL;
492 52 : if (change->data.msg.message != NULL)
493 52 : pfree(change->data.msg.message);
494 52 : change->data.msg.message = NULL;
495 52 : break;
496 : case REORDER_BUFFER_CHANGE_INVALIDATION:
497 5806 : if (change->data.inval.invalidations)
498 5806 : pfree(change->data.inval.invalidations);
499 5806 : change->data.inval.invalidations = NULL;
500 5806 : break;
501 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
502 1200 : if (change->data.snapshot)
503 : {
504 1200 : ReorderBufferFreeSnap(rb, change->data.snapshot);
505 1200 : change->data.snapshot = NULL;
506 : }
507 1200 : break;
508 : /* no data in addition to the struct itself */
509 : case REORDER_BUFFER_CHANGE_TRUNCATE:
510 26 : if (change->data.truncate.relids != NULL)
511 : {
512 26 : ReorderBufferReturnRelids(rb, change->data.truncate.relids);
513 26 : change->data.truncate.relids = NULL;
514 : }
515 26 : break;
516 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
517 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
518 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
519 101190 : break;
520 : }
521 :
522 3361556 : pfree(change);
523 3361556 : }
524 :
525 : /*
526 : * Get a fresh ReorderBufferTupleBuf fitting at least a tuple of size
527 : * tuple_len (excluding header overhead).
528 : */
529 : ReorderBufferTupleBuf *
530 3117582 : ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
531 : {
532 : ReorderBufferTupleBuf *tuple;
533 : Size alloc_len;
534 :
535 3117582 : alloc_len = tuple_len + SizeofHeapTupleHeader;
536 :
537 3117582 : tuple = (ReorderBufferTupleBuf *)
538 3117582 : MemoryContextAlloc(rb->tup_context,
539 : sizeof(ReorderBufferTupleBuf) +
540 : MAXIMUM_ALIGNOF + alloc_len);
541 3117582 : tuple->alloc_tuple_size = alloc_len;
542 3117582 : tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
543 :
544 3117582 : return tuple;
545 : }
546 :
547 : /*
548 : * Free an ReorderBufferTupleBuf.
549 : */
550 : void
551 3114740 : ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
552 : {
553 3114740 : pfree(tuple);
554 3114740 : }
555 :
556 : /*
557 : * Get an array for relids of truncated relations.
558 : *
559 : * We use the global memory context (for the whole reorder buffer), because
560 : * none of the existing ones seems like a good match (some are SLAB, so we
561 : * can't use those, and tup_context is meant for tuple data, not relids). We
562 : * could add yet another context, but it seems like an overkill - TRUNCATE is
563 : * not particularly common operation, so it does not seem worth it.
564 : */
565 : Oid *
566 26 : ReorderBufferGetRelids(ReorderBuffer *rb, int nrelids)
567 : {
568 : Oid *relids;
569 : Size alloc_len;
570 :
571 26 : alloc_len = sizeof(Oid) * nrelids;
572 :
573 26 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
574 :
575 26 : return relids;
576 : }
577 :
578 : /*
579 : * Free an array of relids.
580 : */
581 : void
582 26 : ReorderBufferReturnRelids(ReorderBuffer *rb, Oid *relids)
583 : {
584 26 : pfree(relids);
585 26 : }
586 :
587 : /*
588 : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
589 : * If create is true, and a transaction doesn't already exist, create it
590 : * (with the given LSN, and as top transaction if that's specified);
591 : * when this happens, is_new is set to true.
592 : */
593 : static ReorderBufferTXN *
594 11474082 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
595 : bool *is_new, XLogRecPtr lsn, bool create_as_top)
596 : {
597 : ReorderBufferTXN *txn;
598 : ReorderBufferTXNByIdEnt *ent;
599 : bool found;
600 :
601 11474082 : Assert(TransactionIdIsValid(xid));
602 :
603 : /*
604 : * Check the one-entry lookup cache first
605 : */
606 22944014 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
607 11469932 : rb->by_txn_last_xid == xid)
608 : {
609 9475124 : txn = rb->by_txn_last_txn;
610 :
611 9475124 : if (txn != NULL)
612 : {
613 : /* found it, and it's valid */
614 9475118 : if (is_new)
615 3112 : *is_new = false;
616 9475118 : return txn;
617 : }
618 :
619 : /*
620 : * cached as non-existent, and asked not to create? Then nothing else
621 : * to do.
622 : */
623 6 : if (!create)
624 6 : return NULL;
625 : /* otherwise fall through to create it */
626 : }
627 :
628 : /*
629 : * If the cache wasn't hit or it yielded an "does-not-exist" and we want
630 : * to create an entry.
631 : */
632 :
633 : /* search the lookup table */
634 1998958 : ent = (ReorderBufferTXNByIdEnt *)
635 1998958 : hash_search(rb->by_txn,
636 : (void *) &xid,
637 : create ? HASH_ENTER : HASH_FIND,
638 : &found);
639 1998958 : if (found)
640 1991888 : txn = ent->txn;
641 7070 : else if (create)
642 : {
643 : /* initialize the new entry, if creation was requested */
644 4520 : Assert(ent != NULL);
645 4520 : Assert(lsn != InvalidXLogRecPtr);
646 :
647 4520 : ent->txn = ReorderBufferGetTXN(rb);
648 4520 : ent->txn->xid = xid;
649 4520 : txn = ent->txn;
650 4520 : txn->first_lsn = lsn;
651 4520 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
652 :
653 4520 : if (create_as_top)
654 : {
655 3274 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
656 3274 : AssertTXNLsnOrder(rb);
657 : }
658 : }
659 : else
660 2550 : txn = NULL; /* not found and not asked to create */
661 :
662 : /* update cache */
663 1998958 : rb->by_txn_last_xid = xid;
664 1998958 : rb->by_txn_last_txn = txn;
665 :
666 1998958 : if (is_new)
667 3306 : *is_new = !found;
668 :
669 1998958 : Assert(!create || txn != NULL);
670 1998958 : return txn;
671 : }
672 :
673 : /*
674 : * Record the partial change for the streaming of in-progress transactions. We
675 : * can stream only complete changes so if we have a partial change like toast
676 : * table insert or speculative insert then we mark such a 'txn' so that it
677 : * can't be streamed. We also ensure that if the changes in such a 'txn' are
678 : * above logical_decoding_work_mem threshold then we stream them as soon as we
679 : * have a complete change.
680 : */
681 : static void
682 2998210 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
683 : ReorderBufferChange *change,
684 : bool toast_insert)
685 : {
686 : ReorderBufferTXN *toptxn;
687 :
688 : /*
689 : * The partial changes need to be processed only while streaming
690 : * in-progress transactions.
691 : */
692 2998210 : if (!ReorderBufferCanStream(rb))
693 5434424 : return;
694 :
695 : /* Get the top transaction. */
696 561996 : if (txn->toptxn != NULL)
697 28960 : toptxn = txn->toptxn;
698 : else
699 533036 : toptxn = txn;
700 :
701 : /*
702 : * Set the toast insert bit whenever we get toast insert to indicate a
703 : * partial change and clear it when we get the insert or update on main
704 : * table (Both update and insert will do the insert in the toast table).
705 : */
706 561996 : if (toast_insert)
707 2726 : toptxn->txn_flags |= RBTXN_HAS_TOAST_INSERT;
708 559292 : else if (rbtxn_has_toast_insert(toptxn) &&
709 22 : IsInsertOrUpdate(change->action))
710 22 : toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT;
711 :
712 : /*
713 : * Set the spec insert bit whenever we get the speculative insert to
714 : * indicate the partial change and clear the same on speculative confirm.
715 : */
716 561996 : if (IsSpecInsert(change->action))
717 0 : toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT;
718 561996 : else if (IsSpecConfirm(change->action))
719 : {
720 : /*
721 : * Speculative confirm change must be preceded by speculative
722 : * insertion.
723 : */
724 0 : Assert(rbtxn_has_spec_insert(toptxn));
725 0 : toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT;
726 : }
727 :
728 : /*
729 : * Stream the transaction if it is serialized before and the changes are
730 : * now complete in the top-level transaction.
731 : *
732 : * The reason for doing the streaming of such a transaction as soon as we
733 : * get the complete change for it is that previously it would have reached
734 : * the memory threshold and wouldn't get streamed because of incomplete
735 : * changes. Delaying such transactions would increase apply lag for them.
736 : */
737 888760 : if (ReorderBufferCanStartStreaming(rb) &&
738 974840 : !(rbtxn_has_incomplete_tuple(toptxn)) &&
739 324038 : rbtxn_is_serialized(txn))
740 4 : ReorderBufferStreamTXN(rb, toptxn);
741 : }
742 :
743 : /*
744 : * Queue a change into a transaction so it can be replayed upon commit or will be
745 : * streamed when we reach logical_decoding_work_mem threshold.
746 : */
747 : void
748 3016764 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
749 : ReorderBufferChange *change, bool toast_insert)
750 : {
751 : ReorderBufferTXN *txn;
752 :
753 3016764 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
754 :
755 : /*
756 : * While streaming the previous changes we have detected that the
757 : * transaction is aborted. So there is no point in collecting further
758 : * changes for it.
759 : */
760 3016764 : if (txn->concurrent_abort)
761 : {
762 : /*
763 : * We don't need to update memory accounting for this change as we
764 : * have not added it to the queue yet.
765 : */
766 18554 : ReorderBufferReturnChange(rb, change, false);
767 3035312 : return;
768 : }
769 :
770 2998210 : change->lsn = lsn;
771 2998210 : change->txn = txn;
772 :
773 2998210 : Assert(InvalidXLogRecPtr != lsn);
774 2998210 : dlist_push_tail(&txn->changes, &change->node);
775 2998210 : txn->nentries++;
776 2998210 : txn->nentries_mem++;
777 :
778 : /* update memory accounting information */
779 2998210 : ReorderBufferChangeMemoryUpdate(rb, change, true);
780 :
781 : /* process partial change */
782 2998210 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
783 :
784 : /* check the memory limits and evict something if needed */
785 2998210 : ReorderBufferCheckMemoryLimit(rb);
786 : }
787 :
788 : /*
789 : * Queue message into a transaction so it can be processed upon commit.
790 : */
791 : void
792 56 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
793 : Snapshot snapshot, XLogRecPtr lsn,
794 : bool transactional, const char *prefix,
795 : Size message_size, const char *message)
796 : {
797 56 : if (transactional)
798 : {
799 : MemoryContext oldcontext;
800 : ReorderBufferChange *change;
801 :
802 50 : Assert(xid != InvalidTransactionId);
803 :
804 50 : oldcontext = MemoryContextSwitchTo(rb->context);
805 :
806 50 : change = ReorderBufferGetChange(rb);
807 50 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
808 50 : change->data.msg.prefix = pstrdup(prefix);
809 50 : change->data.msg.message_size = message_size;
810 50 : change->data.msg.message = palloc(message_size);
811 50 : memcpy(change->data.msg.message, message, message_size);
812 :
813 50 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
814 :
815 50 : MemoryContextSwitchTo(oldcontext);
816 : }
817 : else
818 : {
819 6 : ReorderBufferTXN *txn = NULL;
820 6 : volatile Snapshot snapshot_now = snapshot;
821 :
822 6 : if (xid != InvalidTransactionId)
823 4 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
824 :
825 : /* setup snapshot to allow catalog access */
826 6 : SetupHistoricSnapshot(snapshot_now, NULL);
827 6 : PG_TRY();
828 : {
829 6 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
830 :
831 6 : TeardownHistoricSnapshot(false);
832 : }
833 0 : PG_CATCH();
834 : {
835 0 : TeardownHistoricSnapshot(true);
836 0 : PG_RE_THROW();
837 : }
838 6 : PG_END_TRY();
839 : }
840 56 : }
841 :
842 : /*
843 : * AssertTXNLsnOrder
844 : * Verify LSN ordering of transaction lists in the reorderbuffer
845 : *
846 : * Other LSN-related invariants are checked too.
847 : *
848 : * No-op if assertions are not in use.
849 : */
850 : static void
851 8102 : AssertTXNLsnOrder(ReorderBuffer *rb)
852 : {
853 : #ifdef USE_ASSERT_CHECKING
854 : dlist_iter iter;
855 8102 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
856 8102 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
857 :
858 16848 : dlist_foreach(iter, &rb->toplevel_by_lsn)
859 : {
860 8746 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
861 : iter.cur);
862 :
863 : /* start LSN must be set */
864 8746 : Assert(cur_txn->first_lsn != InvalidXLogRecPtr);
865 :
866 : /* If there is an end LSN, it must be higher than start LSN */
867 8746 : if (cur_txn->end_lsn != InvalidXLogRecPtr)
868 8 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
869 :
870 : /* Current initial LSN must be strictly higher than previous */
871 8746 : if (prev_first_lsn != InvalidXLogRecPtr)
872 952 : Assert(prev_first_lsn < cur_txn->first_lsn);
873 :
874 : /* known-as-subtxn txns must not be listed */
875 8746 : Assert(!rbtxn_is_known_subxact(cur_txn));
876 :
877 8746 : prev_first_lsn = cur_txn->first_lsn;
878 : }
879 :
880 13316 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
881 : {
882 5214 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
883 : base_snapshot_node,
884 : iter.cur);
885 :
886 : /* base snapshot (and its LSN) must be set */
887 5214 : Assert(cur_txn->base_snapshot != NULL);
888 5214 : Assert(cur_txn->base_snapshot_lsn != InvalidXLogRecPtr);
889 :
890 : /* current LSN must be strictly higher than previous */
891 5214 : if (prev_base_snap_lsn != InvalidXLogRecPtr)
892 838 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
893 :
894 : /* known-as-subtxn txns must not be listed */
895 5214 : Assert(!rbtxn_is_known_subxact(cur_txn));
896 :
897 5214 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
898 : }
899 : #endif
900 8102 : }
901 :
902 : /*
903 : * AssertChangeLsnOrder
904 : *
905 : * Check ordering of changes in the (sub)transaction.
906 : */
907 : static void
908 2404 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
909 : {
910 : #ifdef USE_ASSERT_CHECKING
911 : dlist_iter iter;
912 2404 : XLogRecPtr prev_lsn = txn->first_lsn;
913 :
914 358004 : dlist_foreach(iter, &txn->changes)
915 : {
916 : ReorderBufferChange *cur_change;
917 :
918 355600 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
919 :
920 355600 : Assert(txn->first_lsn != InvalidXLogRecPtr);
921 355600 : Assert(cur_change->lsn != InvalidXLogRecPtr);
922 355600 : Assert(txn->first_lsn <= cur_change->lsn);
923 :
924 355600 : if (txn->end_lsn != InvalidXLogRecPtr)
925 40560 : Assert(cur_change->lsn <= txn->end_lsn);
926 :
927 355600 : Assert(prev_lsn <= cur_change->lsn);
928 :
929 355600 : prev_lsn = cur_change->lsn;
930 : }
931 : #endif
932 2404 : }
933 :
934 : /*
935 : * ReorderBufferGetOldestTXN
936 : * Return oldest transaction in reorderbuffer
937 : */
938 : ReorderBufferTXN *
939 166 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
940 : {
941 : ReorderBufferTXN *txn;
942 :
943 166 : AssertTXNLsnOrder(rb);
944 :
945 166 : if (dlist_is_empty(&rb->toplevel_by_lsn))
946 148 : return NULL;
947 :
948 18 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
949 :
950 18 : Assert(!rbtxn_is_known_subxact(txn));
951 18 : Assert(txn->first_lsn != InvalidXLogRecPtr);
952 18 : return txn;
953 : }
954 :
955 : /*
956 : * ReorderBufferGetOldestXmin
957 : * Return oldest Xmin in reorderbuffer
958 : *
959 : * Returns oldest possibly running Xid from the point of view of snapshots
960 : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
961 : * there are none.
962 : *
963 : * Since snapshots are assigned monotonically, this equals the Xmin of the
964 : * base snapshot with minimal base_snapshot_lsn.
965 : */
966 : TransactionId
967 186 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
968 : {
969 : ReorderBufferTXN *txn;
970 :
971 186 : AssertTXNLsnOrder(rb);
972 :
973 186 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
974 168 : return InvalidTransactionId;
975 :
976 18 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
977 : &rb->txns_by_base_snapshot_lsn);
978 18 : return txn->base_snapshot->xmin;
979 : }
980 :
981 : void
982 200 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
983 : {
984 200 : rb->current_restart_decoding_lsn = ptr;
985 200 : }
986 :
987 : /*
988 : * ReorderBufferAssignChild
989 : *
990 : * Make note that we know that subxid is a subtransaction of xid, seen as of
991 : * the given lsn.
992 : */
993 : void
994 1594 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
995 : TransactionId subxid, XLogRecPtr lsn)
996 : {
997 : ReorderBufferTXN *txn;
998 : ReorderBufferTXN *subtxn;
999 : bool new_top;
1000 : bool new_sub;
1001 :
1002 1594 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1003 1594 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false);
1004 :
1005 1594 : if (!new_sub)
1006 : {
1007 348 : if (rbtxn_is_known_subxact(subtxn))
1008 : {
1009 : /* already associated, nothing to do */
1010 1942 : return;
1011 : }
1012 : else
1013 : {
1014 : /*
1015 : * We already saw this transaction, but initially added it to the
1016 : * list of top-level txns. Now that we know it's not top-level,
1017 : * remove it from there.
1018 : */
1019 0 : dlist_delete(&subtxn->node);
1020 : }
1021 : }
1022 :
1023 1246 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
1024 1246 : subtxn->toplevel_xid = xid;
1025 1246 : Assert(subtxn->nsubtxns == 0);
1026 :
1027 : /* set the reference to top-level transaction */
1028 1246 : subtxn->toptxn = txn;
1029 :
1030 : /* add to subtransaction list */
1031 1246 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1032 1246 : txn->nsubtxns++;
1033 :
1034 : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1035 1246 : ReorderBufferTransferSnapToParent(txn, subtxn);
1036 :
1037 : /* Verify LSN-ordering invariant */
1038 1246 : AssertTXNLsnOrder(rb);
1039 : }
1040 :
1041 : /*
1042 : * ReorderBufferTransferSnapToParent
1043 : * Transfer base snapshot from subtxn to top-level txn, if needed
1044 : *
1045 : * This is done if the top-level txn doesn't have a base snapshot, or if the
1046 : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1047 : * snapshot's LSN. This can happen if there are no changes in the toplevel
1048 : * txn but there are some in the subtxn, or the first change in subtxn has
1049 : * earlier LSN than first change in the top-level txn and we learned about
1050 : * their kinship only now.
1051 : *
1052 : * The subtransaction's snapshot is cleared regardless of the transfer
1053 : * happening, since it's not needed anymore in either case.
1054 : *
1055 : * We do this as soon as we become aware of their kinship, to avoid queueing
1056 : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1057 : * receive further snapshots.
1058 : */
1059 : static void
1060 1248 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1061 : ReorderBufferTXN *subtxn)
1062 : {
1063 1248 : Assert(subtxn->toplevel_xid == txn->xid);
1064 :
1065 1248 : if (subtxn->base_snapshot != NULL)
1066 : {
1067 0 : if (txn->base_snapshot == NULL ||
1068 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1069 : {
1070 : /*
1071 : * If the toplevel transaction already has a base snapshot but
1072 : * it's newer than the subxact's, purge it.
1073 : */
1074 0 : if (txn->base_snapshot != NULL)
1075 : {
1076 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1077 0 : dlist_delete(&txn->base_snapshot_node);
1078 : }
1079 :
1080 : /*
1081 : * The snapshot is now the top transaction's; transfer it, and
1082 : * adjust the list position of the top transaction in the list by
1083 : * moving it to where the subtransaction is.
1084 : */
1085 0 : txn->base_snapshot = subtxn->base_snapshot;
1086 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1087 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1088 : &txn->base_snapshot_node);
1089 :
1090 : /*
1091 : * The subtransaction doesn't have a snapshot anymore (so it
1092 : * mustn't be in the list.)
1093 : */
1094 0 : subtxn->base_snapshot = NULL;
1095 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1096 0 : dlist_delete(&subtxn->base_snapshot_node);
1097 : }
1098 : else
1099 : {
1100 : /* Base snap of toplevel is fine, so subxact's is not needed */
1101 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1102 0 : dlist_delete(&subtxn->base_snapshot_node);
1103 0 : subtxn->base_snapshot = NULL;
1104 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1105 : }
1106 : }
1107 1248 : }
1108 :
1109 : /*
1110 : * Associate a subtransaction with its toplevel transaction at commit
1111 : * time. There may be no further changes added after this.
1112 : */
1113 : void
1114 510 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1115 : TransactionId subxid, XLogRecPtr commit_lsn,
1116 : XLogRecPtr end_lsn)
1117 : {
1118 : ReorderBufferTXN *subtxn;
1119 :
1120 510 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1121 : InvalidXLogRecPtr, false);
1122 :
1123 : /*
1124 : * No need to do anything if that subtxn didn't contain any changes
1125 : */
1126 510 : if (!subtxn)
1127 672 : return;
1128 :
1129 348 : subtxn->final_lsn = commit_lsn;
1130 348 : subtxn->end_lsn = end_lsn;
1131 :
1132 : /*
1133 : * Assign this subxact as a child of the toplevel xact (no-op if already
1134 : * done.)
1135 : */
1136 348 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1137 : }
1138 :
1139 :
1140 : /*
1141 : * Support for efficiently iterating over a transaction's and its
1142 : * subtransactions' changes.
1143 : *
1144 : * We do this by doing a k-way merge between transactions/subtransactions. For that
1145 : * we model the current heads of the different transactions as a binary heap
1146 : * so we easily know which (sub-)transaction has the change with the smallest
1147 : * lsn next.
1148 : *
1149 : * We assume the changes in individual transactions are already sorted by LSN.
1150 : */
1151 :
1152 : /*
1153 : * Binary heap comparison function.
1154 : */
1155 : static int
1156 110826 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1157 : {
1158 110826 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1159 110826 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1160 110826 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1161 :
1162 110826 : if (pos_a < pos_b)
1163 108176 : return 1;
1164 2650 : else if (pos_a == pos_b)
1165 0 : return 0;
1166 2650 : return -1;
1167 : }
1168 :
1169 : /*
1170 : * Allocate & initialize an iterator which iterates in lsn order over a
1171 : * transaction and all its subtransactions.
1172 : *
1173 : * Note: The iterator state is returned through iter_state parameter rather
1174 : * than the function's return value. This is because the state gets cleaned up
1175 : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1176 : * back the state even if this function throws an exception.
1177 : */
1178 : static void
1179 1898 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1180 : ReorderBufferIterTXNState *volatile *iter_state)
1181 : {
1182 1898 : Size nr_txns = 0;
1183 : ReorderBufferIterTXNState *state;
1184 : dlist_iter cur_txn_i;
1185 : int32 off;
1186 :
1187 1898 : *iter_state = NULL;
1188 :
1189 : /* Check ordering of changes in the toplevel transaction. */
1190 1898 : AssertChangeLsnOrder(txn);
1191 :
1192 : /*
1193 : * Calculate the size of our heap: one element for every transaction that
1194 : * contains changes. (Besides the transactions already in the reorder
1195 : * buffer, we count the one we were directly passed.)
1196 : */
1197 1898 : if (txn->nentries > 0)
1198 1802 : nr_txns++;
1199 :
1200 2404 : dlist_foreach(cur_txn_i, &txn->subtxns)
1201 : {
1202 : ReorderBufferTXN *cur_txn;
1203 :
1204 506 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1205 :
1206 : /* Check ordering of changes in this subtransaction. */
1207 506 : AssertChangeLsnOrder(cur_txn);
1208 :
1209 506 : if (cur_txn->nentries > 0)
1210 430 : nr_txns++;
1211 : }
1212 :
1213 : /* allocate iteration state */
1214 1898 : state = (ReorderBufferIterTXNState *)
1215 1898 : MemoryContextAllocZero(rb->context,
1216 : sizeof(ReorderBufferIterTXNState) +
1217 1898 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1218 :
1219 1898 : state->nr_txns = nr_txns;
1220 1898 : dlist_init(&state->old_change);
1221 :
1222 4130 : for (off = 0; off < state->nr_txns; off++)
1223 : {
1224 2232 : state->entries[off].file.vfd = -1;
1225 2232 : state->entries[off].segno = 0;
1226 : }
1227 :
1228 : /* allocate heap */
1229 1898 : state->heap = binaryheap_allocate(state->nr_txns,
1230 : ReorderBufferIterCompare,
1231 : state);
1232 :
1233 : /* Now that the state fields are initialized, it is safe to return it. */
1234 1898 : *iter_state = state;
1235 :
1236 : /*
1237 : * Now insert items into the binary heap, in an unordered fashion. (We
1238 : * will run a heap assembly step at the end; this is more efficient.)
1239 : */
1240 :
1241 1898 : off = 0;
1242 :
1243 : /* add toplevel transaction if it contains changes */
1244 1898 : if (txn->nentries > 0)
1245 : {
1246 : ReorderBufferChange *cur_change;
1247 :
1248 1802 : if (rbtxn_is_serialized(txn))
1249 : {
1250 : /* serialize remaining changes */
1251 34 : ReorderBufferSerializeTXN(rb, txn);
1252 34 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1253 : &state->entries[off].segno);
1254 : }
1255 :
1256 1802 : cur_change = dlist_head_element(ReorderBufferChange, node,
1257 : &txn->changes);
1258 :
1259 1802 : state->entries[off].lsn = cur_change->lsn;
1260 1802 : state->entries[off].change = cur_change;
1261 1802 : state->entries[off].txn = txn;
1262 :
1263 1802 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1264 : }
1265 :
1266 : /* add subtransactions if they contain changes */
1267 2404 : dlist_foreach(cur_txn_i, &txn->subtxns)
1268 : {
1269 : ReorderBufferTXN *cur_txn;
1270 :
1271 506 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1272 :
1273 506 : if (cur_txn->nentries > 0)
1274 : {
1275 : ReorderBufferChange *cur_change;
1276 :
1277 430 : if (rbtxn_is_serialized(cur_txn))
1278 : {
1279 : /* serialize remaining changes */
1280 32 : ReorderBufferSerializeTXN(rb, cur_txn);
1281 32 : ReorderBufferRestoreChanges(rb, cur_txn,
1282 : &state->entries[off].file,
1283 : &state->entries[off].segno);
1284 : }
1285 430 : cur_change = dlist_head_element(ReorderBufferChange, node,
1286 : &cur_txn->changes);
1287 :
1288 430 : state->entries[off].lsn = cur_change->lsn;
1289 430 : state->entries[off].change = cur_change;
1290 430 : state->entries[off].txn = cur_txn;
1291 :
1292 430 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1293 : }
1294 : }
1295 :
1296 : /* assemble a valid binary heap */
1297 1898 : binaryheap_build(state->heap);
1298 1898 : }
1299 :
1300 : /*
1301 : * Return the next change when iterating over a transaction and its
1302 : * subtransactions.
1303 : *
1304 : * Returns NULL when no further changes exist.
1305 : */
1306 : static ReorderBufferChange *
1307 660608 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1308 : {
1309 : ReorderBufferChange *change;
1310 : ReorderBufferIterTXNEntry *entry;
1311 : int32 off;
1312 :
1313 : /* nothing there anymore */
1314 660608 : if (state->heap->bh_size == 0)
1315 1878 : return NULL;
1316 :
1317 658730 : off = DatumGetInt32(binaryheap_first(state->heap));
1318 658730 : entry = &state->entries[off];
1319 :
1320 : /* free memory we might have "leaked" in the previous *Next call */
1321 658730 : if (!dlist_is_empty(&state->old_change))
1322 : {
1323 82 : change = dlist_container(ReorderBufferChange, node,
1324 : dlist_pop_head_node(&state->old_change));
1325 82 : ReorderBufferReturnChange(rb, change, true);
1326 82 : Assert(dlist_is_empty(&state->old_change));
1327 : }
1328 :
1329 658730 : change = entry->change;
1330 :
1331 : /*
1332 : * update heap with information about which transaction has the next
1333 : * relevant change in LSN order
1334 : */
1335 :
1336 : /* there are in-memory changes */
1337 658730 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1338 : {
1339 656454 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1340 656454 : ReorderBufferChange *next_change =
1341 656454 : dlist_container(ReorderBufferChange, node, next);
1342 :
1343 : /* txn stays the same */
1344 656454 : state->entries[off].lsn = next_change->lsn;
1345 656454 : state->entries[off].change = next_change;
1346 :
1347 656454 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1348 656454 : return change;
1349 : }
1350 :
1351 : /* try to load changes from disk */
1352 2276 : if (entry->txn->nentries != entry->txn->nentries_mem)
1353 : {
1354 : /*
1355 : * Ugly: restoring changes will reuse *Change records, thus delete the
1356 : * current one from the per-tx list and only free in the next call.
1357 : */
1358 116 : dlist_delete(&change->node);
1359 116 : dlist_push_tail(&state->old_change, &change->node);
1360 :
1361 116 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1362 : &state->entries[off].segno))
1363 : {
1364 : /* successfully restored changes from disk */
1365 64 : ReorderBufferChange *next_change =
1366 64 : dlist_head_element(ReorderBufferChange, node,
1367 : &entry->txn->changes);
1368 :
1369 64 : elog(DEBUG2, "restored %u/%u changes from disk",
1370 : (uint32) entry->txn->nentries_mem,
1371 : (uint32) entry->txn->nentries);
1372 :
1373 64 : Assert(entry->txn->nentries_mem);
1374 : /* txn stays the same */
1375 64 : state->entries[off].lsn = next_change->lsn;
1376 64 : state->entries[off].change = next_change;
1377 64 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1378 :
1379 64 : return change;
1380 : }
1381 : }
1382 :
1383 : /* ok, no changes there anymore, remove */
1384 2212 : binaryheap_remove_first(state->heap);
1385 :
1386 2212 : return change;
1387 : }
1388 :
1389 : /*
1390 : * Deallocate the iterator
1391 : */
1392 : static void
1393 1892 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1394 : ReorderBufferIterTXNState *state)
1395 : {
1396 : int32 off;
1397 :
1398 4118 : for (off = 0; off < state->nr_txns; off++)
1399 : {
1400 2226 : if (state->entries[off].file.vfd != -1)
1401 0 : FileClose(state->entries[off].file.vfd);
1402 : }
1403 :
1404 : /* free memory we might have "leaked" in the last *Next call */
1405 1892 : if (!dlist_is_empty(&state->old_change))
1406 : {
1407 : ReorderBufferChange *change;
1408 :
1409 32 : change = dlist_container(ReorderBufferChange, node,
1410 : dlist_pop_head_node(&state->old_change));
1411 32 : ReorderBufferReturnChange(rb, change, true);
1412 32 : Assert(dlist_is_empty(&state->old_change));
1413 : }
1414 :
1415 1892 : binaryheap_free(state->heap);
1416 1892 : pfree(state);
1417 1892 : }
1418 :
1419 : /*
1420 : * Cleanup the contents of a transaction, usually after the transaction
1421 : * committed or aborted.
1422 : */
1423 : static void
1424 4468 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1425 : {
1426 : bool found;
1427 : dlist_mutable_iter iter;
1428 :
1429 : /* cleanup subtransactions & their changes */
1430 4816 : dlist_foreach_modify(iter, &txn->subtxns)
1431 : {
1432 : ReorderBufferTXN *subtxn;
1433 :
1434 348 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1435 :
1436 : /*
1437 : * Subtransactions are always associated to the toplevel TXN, even if
1438 : * they originally were happening inside another subtxn, so we won't
1439 : * ever recurse more than one level deep here.
1440 : */
1441 348 : Assert(rbtxn_is_known_subxact(subtxn));
1442 348 : Assert(subtxn->nsubtxns == 0);
1443 :
1444 348 : ReorderBufferCleanupTXN(rb, subtxn);
1445 : }
1446 :
1447 : /* cleanup changes in the txn */
1448 138056 : dlist_foreach_modify(iter, &txn->changes)
1449 : {
1450 : ReorderBufferChange *change;
1451 :
1452 133588 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1453 :
1454 : /* Check we're not mixing changes from different transactions. */
1455 133588 : Assert(change->txn == txn);
1456 :
1457 133588 : ReorderBufferReturnChange(rb, change, true);
1458 : }
1459 :
1460 : /*
1461 : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1462 : * They are always stored in the toplevel transaction.
1463 : */
1464 35008 : dlist_foreach_modify(iter, &txn->tuplecids)
1465 : {
1466 : ReorderBufferChange *change;
1467 :
1468 30540 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1469 :
1470 : /* Check we're not mixing changes from different transactions. */
1471 30540 : Assert(change->txn == txn);
1472 30540 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1473 :
1474 30540 : ReorderBufferReturnChange(rb, change, true);
1475 : }
1476 :
1477 : /*
1478 : * Cleanup the base snapshot, if set.
1479 : */
1480 4468 : if (txn->base_snapshot != NULL)
1481 : {
1482 3186 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1483 3186 : dlist_delete(&txn->base_snapshot_node);
1484 : }
1485 :
1486 : /*
1487 : * Cleanup the snapshot for the last streamed run.
1488 : */
1489 4468 : if (txn->snapshot_now != NULL)
1490 : {
1491 54 : Assert(rbtxn_is_streamed(txn));
1492 54 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1493 : }
1494 :
1495 : /*
1496 : * Remove TXN from its containing list.
1497 : *
1498 : * Note: if txn is known as subxact, we are deleting the TXN from its
1499 : * parent's list of known subxacts; this leaves the parent's nsubxacts
1500 : * count too high, but we don't care. Otherwise, we are deleting the TXN
1501 : * from the LSN-ordered list of toplevel TXNs.
1502 : */
1503 4468 : dlist_delete(&txn->node);
1504 :
1505 : /* now remove reference from buffer */
1506 4468 : hash_search(rb->by_txn,
1507 4468 : (void *) &txn->xid,
1508 : HASH_REMOVE,
1509 : &found);
1510 4468 : Assert(found);
1511 :
1512 : /* remove entries spilled to disk */
1513 4468 : if (rbtxn_is_serialized(txn))
1514 422 : ReorderBufferRestoreCleanup(rb, txn);
1515 :
1516 : /* deallocate */
1517 4468 : ReorderBufferReturnTXN(rb, txn);
1518 4468 : }
1519 :
1520 : /*
1521 : * Discard changes from a transaction (and subtransactions), either after streaming or
1522 : * after a PREPARE.
1523 : * The flag txn_prepared indicates if this is called after a PREPARE.
1524 : * If streaming, keep the remaining info - transactions, tuplecids, invalidations and
1525 : * snapshots. If after a PREPARE, keep only the invalidations and snapshots.
1526 : */
1527 : static void
1528 1040 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1529 : {
1530 : dlist_mutable_iter iter;
1531 :
1532 : /* cleanup subtransactions & their changes */
1533 1218 : dlist_foreach_modify(iter, &txn->subtxns)
1534 : {
1535 : ReorderBufferTXN *subtxn;
1536 :
1537 178 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1538 :
1539 : /*
1540 : * Subtransactions are always associated to the toplevel TXN, even if
1541 : * they originally were happening inside another subtxn, so we won't
1542 : * ever recurse more than one level deep here.
1543 : */
1544 178 : Assert(rbtxn_is_known_subxact(subtxn));
1545 178 : Assert(subtxn->nsubtxns == 0);
1546 :
1547 178 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1548 : }
1549 :
1550 : /* cleanup changes in the txn */
1551 321496 : dlist_foreach_modify(iter, &txn->changes)
1552 : {
1553 : ReorderBufferChange *change;
1554 :
1555 320456 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1556 :
1557 : /* Check we're not mixing changes from different transactions. */
1558 320456 : Assert(change->txn == txn);
1559 :
1560 : /* remove the change from it's containing list */
1561 320456 : dlist_delete(&change->node);
1562 :
1563 320456 : ReorderBufferReturnChange(rb, change, true);
1564 : }
1565 :
1566 : /*
1567 : * Mark the transaction as streamed.
1568 : *
1569 : * The toplevel transaction, identified by (toptxn==NULL), is marked as
1570 : * streamed always, even if it does not contain any changes (that is, when
1571 : * all the changes are in subtransactions).
1572 : *
1573 : * For subtransactions, we only mark them as streamed when there are
1574 : * changes in them.
1575 : *
1576 : * We do it this way because of aborts - we don't want to send aborts for
1577 : * XIDs the downstream is not aware of. And of course, it always knows
1578 : * about the toplevel xact (we send the XID in all messages), but we never
1579 : * stream XIDs of empty subxacts.
1580 : */
1581 1040 : if ((!txn_prepared) && ((!txn->toptxn) || (txn->nentries_mem != 0)))
1582 918 : txn->txn_flags |= RBTXN_IS_STREAMED;
1583 :
1584 1040 : if (txn_prepared)
1585 : {
1586 : /*
1587 : * If this is a prepared txn, cleanup the tuplecids we stored for decoding
1588 : * catalog snapshot access.
1589 : * They are always stored in the toplevel transaction.
1590 : */
1591 300 : dlist_foreach_modify(iter, &txn->tuplecids)
1592 : {
1593 : ReorderBufferChange *change;
1594 :
1595 252 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1596 :
1597 : /* Check we're not mixing changes from different transactions. */
1598 252 : Assert(change->txn == txn);
1599 252 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1600 :
1601 : /* Remove the change from its containing list. */
1602 252 : dlist_delete(&change->node);
1603 :
1604 252 : ReorderBufferReturnChange(rb, change, true);
1605 : }
1606 : }
1607 :
1608 : /*
1609 : * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any
1610 : * memory. We could also keep the hash table and update it with new ctid
1611 : * values, but this seems simpler and good enough for now.
1612 : */
1613 1040 : if (txn->tuplecid_hash != NULL)
1614 : {
1615 36 : hash_destroy(txn->tuplecid_hash);
1616 36 : txn->tuplecid_hash = NULL;
1617 : }
1618 :
1619 : /* If this txn is serialized then clean the disk space. */
1620 1040 : if (rbtxn_is_serialized(txn))
1621 : {
1622 4 : ReorderBufferRestoreCleanup(rb, txn);
1623 4 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1624 :
1625 : /*
1626 : * We set this flag to indicate if the transaction is ever serialized.
1627 : * We need this to accurately update the stats as otherwise the same
1628 : * transaction can be counted as serialized multiple times.
1629 : */
1630 4 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1631 : }
1632 :
1633 : /* also reset the number of entries in the transaction */
1634 1040 : txn->nentries_mem = 0;
1635 1040 : txn->nentries = 0;
1636 1040 : }
1637 :
1638 : /*
1639 : * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by
1640 : * HeapTupleSatisfiesHistoricMVCC.
1641 : */
1642 : static void
1643 1898 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1644 : {
1645 : dlist_iter iter;
1646 : HASHCTL hash_ctl;
1647 :
1648 1898 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
1649 3358 : return;
1650 :
1651 438 : memset(&hash_ctl, 0, sizeof(hash_ctl));
1652 :
1653 438 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1654 438 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1655 438 : hash_ctl.hcxt = rb->context;
1656 :
1657 : /*
1658 : * create the hash with the exact number of to-be-stored tuplecids from
1659 : * the start
1660 : */
1661 438 : txn->tuplecid_hash =
1662 438 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1663 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1664 :
1665 12502 : dlist_foreach(iter, &txn->tuplecids)
1666 : {
1667 : ReorderBufferTupleCidKey key;
1668 : ReorderBufferTupleCidEnt *ent;
1669 : bool found;
1670 : ReorderBufferChange *change;
1671 :
1672 12064 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1673 :
1674 12064 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1675 :
1676 : /* be careful about padding */
1677 12064 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1678 :
1679 12064 : key.relnode = change->data.tuplecid.node;
1680 :
1681 12064 : ItemPointerCopy(&change->data.tuplecid.tid,
1682 : &key.tid);
1683 :
1684 12064 : ent = (ReorderBufferTupleCidEnt *)
1685 12064 : hash_search(txn->tuplecid_hash,
1686 : (void *) &key,
1687 : HASH_ENTER | HASH_FIND,
1688 : &found);
1689 12064 : if (!found)
1690 : {
1691 9154 : ent->cmin = change->data.tuplecid.cmin;
1692 9154 : ent->cmax = change->data.tuplecid.cmax;
1693 9154 : ent->combocid = change->data.tuplecid.combocid;
1694 : }
1695 : else
1696 : {
1697 : /*
1698 : * Maybe we already saw this tuple before in this transaction, but
1699 : * if so it must have the same cmin.
1700 : */
1701 2910 : Assert(ent->cmin == change->data.tuplecid.cmin);
1702 :
1703 : /*
1704 : * cmax may be initially invalid, but once set it can only grow,
1705 : * and never become invalid again.
1706 : */
1707 2910 : Assert((ent->cmax == InvalidCommandId) ||
1708 : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1709 : (change->data.tuplecid.cmax > ent->cmax)));
1710 2910 : ent->cmax = change->data.tuplecid.cmax;
1711 : }
1712 : }
1713 : }
1714 :
1715 : /*
1716 : * Copy a provided snapshot so we can modify it privately. This is needed so
1717 : * that catalog modifying transactions can look into intermediate catalog
1718 : * states.
1719 : */
1720 : static Snapshot
1721 1642 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1722 : ReorderBufferTXN *txn, CommandId cid)
1723 : {
1724 : Snapshot snap;
1725 : dlist_iter iter;
1726 1642 : int i = 0;
1727 : Size size;
1728 :
1729 1642 : size = sizeof(SnapshotData) +
1730 3284 : sizeof(TransactionId) * orig_snap->xcnt +
1731 1642 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1732 :
1733 1642 : snap = MemoryContextAllocZero(rb->context, size);
1734 1642 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1735 :
1736 1642 : snap->copied = true;
1737 1642 : snap->active_count = 1; /* mark as active so nobody frees it */
1738 1642 : snap->regd_count = 0;
1739 1642 : snap->xip = (TransactionId *) (snap + 1);
1740 :
1741 1642 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1742 :
1743 : /*
1744 : * snap->subxip contains all txids that belong to our transaction which we
1745 : * need to check via cmin/cmax. That's why we store the toplevel
1746 : * transaction in there as well.
1747 : */
1748 1642 : snap->subxip = snap->xip + snap->xcnt;
1749 1642 : snap->subxip[i++] = txn->xid;
1750 :
1751 : /*
1752 : * subxcnt isn't decreased when subtransactions abort, so count manually.
1753 : * Since it's an upper boundary it is safe to use it for the allocation
1754 : * above.
1755 : */
1756 1642 : snap->subxcnt = 1;
1757 :
1758 1838 : dlist_foreach(iter, &txn->subtxns)
1759 : {
1760 : ReorderBufferTXN *sub_txn;
1761 :
1762 196 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1763 196 : snap->subxip[i++] = sub_txn->xid;
1764 196 : snap->subxcnt++;
1765 : }
1766 :
1767 : /* sort so we can bsearch() later */
1768 1642 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1769 :
1770 : /* store the specified current CommandId */
1771 1642 : snap->curcid = cid;
1772 :
1773 1642 : return snap;
1774 : }
1775 :
1776 : /*
1777 : * Free a previously ReorderBufferCopySnap'ed snapshot
1778 : */
1779 : static void
1780 2834 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1781 : {
1782 2834 : if (snap->copied)
1783 1638 : pfree(snap);
1784 : else
1785 1196 : SnapBuildSnapDecRefcount(snap);
1786 2834 : }
1787 :
1788 : /*
1789 : * If the transaction was (partially) streamed, we need to commit it in a
1790 : * 'streamed' way. That is, we first stream the remaining part of the
1791 : * transaction, and then invoke stream_commit message.
1792 : */
1793 : static void
1794 52 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1795 : {
1796 : /* we should only call this for previously streamed transactions */
1797 52 : Assert(rbtxn_is_streamed(txn));
1798 :
1799 52 : ReorderBufferStreamTXN(rb, txn);
1800 :
1801 52 : if (rbtxn_prepared(txn))
1802 : {
1803 18 : rb->stream_prepare(rb, txn, txn->final_lsn);
1804 : /* Here we are streaming and part of the PREPARE of a two-phase commit
1805 : * The full cleanup will happen as part of the COMMIT PREPAREDs, so now
1806 : * just truncate txn by removing changes and tuple_cids
1807 : */
1808 18 : ReorderBufferTruncateTXN(rb, txn, true);
1809 : /* Reset the CheckXidAlive */
1810 18 : CheckXidAlive = InvalidTransactionId;
1811 : }
1812 : else
1813 : {
1814 34 : rb->stream_commit(rb, txn, txn->final_lsn);
1815 34 : ReorderBufferCleanupTXN(rb, txn);
1816 : }
1817 52 : }
1818 :
1819 : /*
1820 : * Set xid to detect concurrent aborts.
1821 : *
1822 : * While streaming an in-progress transaction there is a possibility that the
1823 : * (sub)transaction might get aborted concurrently. In such case if the
1824 : * (sub)transaction has catalog update then we might decode the tuple using
1825 : * wrong catalog version. For example, suppose there is one catalog tuple with
1826 : * (xmin: 500, xmax: 0). Now, the transaction 501 updates the catalog tuple
1827 : * and after that we will have two tuples (xmin: 500, xmax: 501) and
1828 : * (xmin: 501, xmax: 0). Now, if 501 is aborted and some other transaction
1829 : * say 502 updates the same catalog tuple then the first tuple will be changed
1830 : * to (xmin: 500, xmax: 502). So, the problem is that when we try to decode
1831 : * the tuple inserted/updated in 501 after the catalog update, we will see the
1832 : * catalog tuple with (xmin: 500, xmax: 502) as visible because it will
1833 : * consider that the tuple is deleted by xid 502 which is not visible to our
1834 : * snapshot. And when we will try to decode with that catalog tuple, it can
1835 : * lead to a wrong result or a crash. So, it is necessary to detect
1836 : * concurrent aborts to allow streaming of in-progress transactions.
1837 : *
1838 : * For detecting the concurrent abort we set CheckXidAlive to the current
1839 : * (sub)transaction's xid for which this change belongs to. And, during
1840 : * catalog scan we can check the status of the xid and if it is aborted we will
1841 : * report a specific error so that we can stop streaming current transaction
1842 : * and discard the already streamed changes on such an error. We might have
1843 : * already streamed some of the changes for the aborted (sub)transaction, but
1844 : * that is fine because when we decode the abort we will stream abort message
1845 : * to truncate the changes in the subscriber.
1846 : */
1847 : static inline void
1848 321612 : SetupCheckXidLive(TransactionId xid)
1849 : {
1850 : /*
1851 : * If the input transaction id is already set as a CheckXidAlive then
1852 : * nothing to do.
1853 : */
1854 321612 : if (TransactionIdEquals(CheckXidAlive, xid))
1855 517688 : return;
1856 :
1857 : /*
1858 : * setup CheckXidAlive if it's not committed yet. We don't check if the
1859 : * xid is aborted. That will happen during catalog access.
1860 : */
1861 125536 : if (!TransactionIdDidCommit(xid))
1862 544 : CheckXidAlive = xid;
1863 : else
1864 124992 : CheckXidAlive = InvalidTransactionId;
1865 : }
1866 :
1867 : /*
1868 : * Helper function for ReorderBufferProcessTXN for applying change.
1869 : */
1870 : static inline void
1871 632718 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
1872 : Relation relation, ReorderBufferChange *change,
1873 : bool streaming)
1874 : {
1875 632718 : if (streaming)
1876 318344 : rb->stream_change(rb, txn, relation, change);
1877 : else
1878 314374 : rb->apply_change(rb, txn, relation, change);
1879 632710 : }
1880 :
1881 : /*
1882 : * Helper function for ReorderBufferProcessTXN for applying the truncate.
1883 : */
1884 : static inline void
1885 20 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
1886 : int nrelations, Relation *relations,
1887 : ReorderBufferChange *change, bool streaming)
1888 : {
1889 20 : if (streaming)
1890 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
1891 : else
1892 20 : rb->apply_truncate(rb, txn, nrelations, relations, change);
1893 20 : }
1894 :
1895 : /*
1896 : * Helper function for ReorderBufferProcessTXN for applying the message.
1897 : */
1898 : static inline void
1899 12 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
1900 : ReorderBufferChange *change, bool streaming)
1901 : {
1902 12 : if (streaming)
1903 6 : rb->stream_message(rb, txn, change->lsn, true,
1904 2 : change->data.msg.prefix,
1905 : change->data.msg.message_size,
1906 2 : change->data.msg.message);
1907 : else
1908 30 : rb->message(rb, txn, change->lsn, true,
1909 10 : change->data.msg.prefix,
1910 : change->data.msg.message_size,
1911 10 : change->data.msg.message);
1912 12 : }
1913 :
1914 : /*
1915 : * Function to store the command id and snapshot at the end of the current
1916 : * stream so that we can reuse the same while sending the next stream.
1917 : */
1918 : static inline void
1919 814 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
1920 : Snapshot snapshot_now, CommandId command_id)
1921 : {
1922 814 : txn->command_id = command_id;
1923 :
1924 : /* Avoid copying if it's already copied. */
1925 814 : if (snapshot_now->copied)
1926 814 : txn->snapshot_now = snapshot_now;
1927 : else
1928 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
1929 : txn, command_id);
1930 814 : }
1931 :
1932 : /*
1933 : * Helper function for ReorderBufferProcessTXN to handle the concurrent
1934 : * abort of the streaming transaction. This resets the TXN such that it
1935 : * can be used to stream the remaining data of transaction being processed.
1936 : */
1937 : static void
1938 10 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
1939 : Snapshot snapshot_now,
1940 : CommandId command_id,
1941 : XLogRecPtr last_lsn,
1942 : ReorderBufferChange *specinsert)
1943 : {
1944 : /* Discard the changes that we just streamed.
1945 : * This can only be called if streaming and not part of a PREPARE in
1946 : * a two-phase commit, so set prepared flag as false.
1947 : */
1948 10 : ReorderBufferTruncateTXN(rb, txn, false);
1949 :
1950 : /* Free all resources allocated for toast reconstruction */
1951 10 : ReorderBufferToastReset(rb, txn);
1952 :
1953 : /* Return the spec insert change if it is not NULL */
1954 10 : if (specinsert != NULL)
1955 : {
1956 0 : ReorderBufferReturnChange(rb, specinsert, true);
1957 0 : specinsert = NULL;
1958 : }
1959 :
1960 : /* Stop the stream. */
1961 10 : rb->stream_stop(rb, txn, last_lsn);
1962 :
1963 : /* Remember the command ID and snapshot for the streaming run */
1964 10 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
1965 10 : }
1966 :
1967 : /*
1968 : * Helper function for ReorderBufferCommit and ReorderBufferStreamTXN.
1969 : *
1970 : * We are here due to one of the 3 scenarios:
1971 : * 1. As part of streaming an in-progress transactions
1972 : * 2. Prepare of a two-phase commit
1973 : * 3. Commit of a transaction.
1974 : *
1975 : * Send data of a transaction (and its subtransactions) to the
1976 : * output plugin. We iterate over the top and subtransactions (using a k-way
1977 : * merge) and replay the changes in lsn order.
1978 : *
1979 : * If streaming is true then data will be sent using stream API.
1980 : *
1981 : * Note: "volatile" markers on some parameters are to avoid trouble with
1982 : * PG_TRY inside the function.
1983 : */
1984 : static void
1985 1898 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
1986 : XLogRecPtr commit_lsn,
1987 : volatile Snapshot snapshot_now,
1988 : volatile CommandId command_id,
1989 : bool streaming)
1990 : {
1991 : bool using_subtxn;
1992 1898 : MemoryContext ccxt = CurrentMemoryContext;
1993 1898 : ReorderBufferIterTXNState *volatile iterstate = NULL;
1994 1898 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
1995 1898 : ReorderBufferChange *volatile specinsert = NULL;
1996 1898 : volatile bool stream_started = false;
1997 1898 : ReorderBufferTXN *volatile curtxn = NULL;
1998 :
1999 : /* build data to be able to lookup the CommandIds of catalog tuples */
2000 1898 : ReorderBufferBuildTupleCidHash(rb, txn);
2001 :
2002 : /* setup the initial snapshot */
2003 1898 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2004 :
2005 : /*
2006 : * Decoding needs access to syscaches et al., which in turn use
2007 : * heavyweight locks and such. Thus we need to have enough state around to
2008 : * keep track of those. The easiest way is to simply use a transaction
2009 : * internally. That also allows us to easily enforce that nothing writes
2010 : * to the database by checking for xid assignments.
2011 : *
2012 : * When we're called via the SQL SRF there's already a transaction
2013 : * started, so start an explicit subtransaction there.
2014 : */
2015 1898 : using_subtxn = IsTransactionOrTransactionBlock();
2016 :
2017 1898 : PG_TRY();
2018 : {
2019 : ReorderBufferChange *change;
2020 :
2021 1898 : if (using_subtxn)
2022 740 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2023 : else
2024 1158 : StartTransactionCommand();
2025 :
2026 : /* We only need to send begin/commit for non-streamed transactions. */
2027 1898 : if (!streaming)
2028 1078 : rb->begin(rb, txn);
2029 :
2030 1898 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
2031 1898 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2032 : {
2033 658730 : Relation relation = NULL;
2034 : Oid reloid;
2035 :
2036 : /*
2037 : * We can't call start stream callback before processing first
2038 : * change.
2039 : */
2040 658730 : if (prev_lsn == InvalidXLogRecPtr)
2041 : {
2042 1890 : if (streaming)
2043 : {
2044 812 : txn->origin_id = change->origin_id;
2045 812 : rb->stream_start(rb, txn, change->lsn);
2046 812 : stream_started = true;
2047 : }
2048 : }
2049 :
2050 : /*
2051 : * Enforce correct ordering of changes, merged from multiple
2052 : * subtransactions. The changes may have the same LSN due to
2053 : * MULTI_INSERT xlog records.
2054 : */
2055 658730 : Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn);
2056 :
2057 658730 : prev_lsn = change->lsn;
2058 :
2059 : /* Set the current xid to detect concurrent aborts. */
2060 658730 : if (streaming || rbtxn_prepared(change->txn))
2061 : {
2062 321612 : curtxn = change->txn;
2063 321612 : SetupCheckXidLive(curtxn->xid);
2064 : }
2065 :
2066 658730 : switch (change->action)
2067 : {
2068 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2069 :
2070 : /*
2071 : * Confirmation for speculative insertion arrived. Simply
2072 : * use as a normal record. It'll be cleaned up at the end
2073 : * of INSERT processing.
2074 : */
2075 3564 : if (specinsert == NULL)
2076 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
2077 3564 : Assert(specinsert->data.tp.oldtuple == NULL);
2078 3564 : change = specinsert;
2079 3564 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2080 :
2081 : /* intentionally fall through */
2082 : case REORDER_BUFFER_CHANGE_INSERT:
2083 : case REORDER_BUFFER_CHANGE_UPDATE:
2084 : case REORDER_BUFFER_CHANGE_DELETE:
2085 640764 : Assert(snapshot_now);
2086 :
2087 640764 : reloid = RelidByRelfilenode(change->data.tp.relnode.spcNode,
2088 : change->data.tp.relnode.relNode);
2089 :
2090 : /*
2091 : * Mapped catalog tuple without data, emitted while
2092 : * catalog table was in the process of being rewritten. We
2093 : * can fail to look up the relfilenode, because the
2094 : * relmapper has no "historic" view, in contrast to normal
2095 : * the normal catalog during decoding. Thus repeated
2096 : * rewrites can cause a lookup failure. That's OK because
2097 : * we do not decode catalog changes anyway. Normally such
2098 : * tuples would be skipped over below, but we can't
2099 : * identify whether the table should be logically logged
2100 : * without mapping the relfilenode to the oid.
2101 : */
2102 640904 : if (reloid == InvalidOid &&
2103 304 : change->data.tp.newtuple == NULL &&
2104 152 : change->data.tp.oldtuple == NULL)
2105 : goto change_done;
2106 640600 : else if (reloid == InvalidOid)
2107 0 : elog(ERROR, "could not map filenode \"%s\" to relation OID",
2108 : relpathperm(change->data.tp.relnode,
2109 : MAIN_FORKNUM));
2110 :
2111 640600 : relation = RelationIdGetRelation(reloid);
2112 :
2113 640600 : if (!RelationIsValid(relation))
2114 0 : elog(ERROR, "could not open relation with OID %u (for filenode \"%s\")",
2115 : reloid,
2116 : relpathperm(change->data.tp.relnode,
2117 : MAIN_FORKNUM));
2118 :
2119 640600 : if (!RelationIsLogicallyLogged(relation))
2120 : goto change_done;
2121 :
2122 : /*
2123 : * Ignore temporary heaps created during DDL unless the
2124 : * plugin has asked for them.
2125 : */
2126 636558 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2127 40 : goto change_done;
2128 :
2129 : /*
2130 : * For now ignore sequence changes entirely. Most of the
2131 : * time they don't log changes using records we
2132 : * understand, so it doesn't make sense to handle the few
2133 : * cases we do.
2134 : */
2135 636518 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
2136 0 : goto change_done;
2137 :
2138 : /* user-triggered change */
2139 636518 : if (!IsToastRelation(relation))
2140 : {
2141 632718 : ReorderBufferToastReplace(rb, txn, relation, change);
2142 632718 : ReorderBufferApplyChange(rb, txn, relation, change,
2143 : streaming);
2144 :
2145 : /*
2146 : * Only clear reassembled toast chunks if we're sure
2147 : * they're not required anymore. The creator of the
2148 : * tuple tells us.
2149 : */
2150 632710 : if (change->data.tp.clear_toast_afterwards)
2151 632306 : ReorderBufferToastReset(rb, txn);
2152 : }
2153 : /* we're not interested in toast deletions */
2154 3800 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2155 : {
2156 : /*
2157 : * Need to reassemble the full toasted Datum in
2158 : * memory, to ensure the chunks don't get reused till
2159 : * we're done remove it from the list of this
2160 : * transaction's changes. Otherwise it will get
2161 : * freed/reused while restoring spooled data from
2162 : * disk.
2163 : */
2164 3338 : Assert(change->data.tp.newtuple != NULL);
2165 :
2166 3338 : dlist_delete(&change->node);
2167 3338 : ReorderBufferToastAppendChunk(rb, txn, relation,
2168 : change);
2169 : }
2170 :
2171 : change_done:
2172 :
2173 : /*
2174 : * Either speculative insertion was confirmed, or it was
2175 : * unsuccessful and the record isn't needed anymore.
2176 : */
2177 640744 : if (specinsert != NULL)
2178 : {
2179 3564 : ReorderBufferReturnChange(rb, specinsert, true);
2180 3564 : specinsert = NULL;
2181 : }
2182 :
2183 640744 : if (RelationIsValid(relation))
2184 : {
2185 640592 : RelationClose(relation);
2186 640592 : relation = NULL;
2187 : }
2188 640744 : break;
2189 :
2190 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2191 :
2192 : /*
2193 : * Speculative insertions are dealt with by delaying the
2194 : * processing of the insert until the confirmation record
2195 : * arrives. For that we simply unlink the record from the
2196 : * chain, so it does not get freed/reused while restoring
2197 : * spooled data from disk.
2198 : *
2199 : * This is safe in the face of concurrent catalog changes
2200 : * because the relevant relation can't be changed between
2201 : * speculative insertion and confirmation due to
2202 : * CheckTableNotInUse() and locking.
2203 : */
2204 :
2205 : /* clear out a pending (and thus failed) speculation */
2206 3564 : if (specinsert != NULL)
2207 : {
2208 0 : ReorderBufferReturnChange(rb, specinsert, true);
2209 0 : specinsert = NULL;
2210 : }
2211 :
2212 : /* and memorize the pending insertion */
2213 3564 : dlist_delete(&change->node);
2214 3564 : specinsert = change;
2215 3564 : break;
2216 :
2217 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2218 : {
2219 : int i;
2220 20 : int nrelids = change->data.truncate.nrelids;
2221 20 : int nrelations = 0;
2222 : Relation *relations;
2223 :
2224 20 : relations = palloc0(nrelids * sizeof(Relation));
2225 50 : for (i = 0; i < nrelids; i++)
2226 : {
2227 30 : Oid relid = change->data.truncate.relids[i];
2228 : Relation relation;
2229 :
2230 30 : relation = RelationIdGetRelation(relid);
2231 :
2232 30 : if (!RelationIsValid(relation))
2233 0 : elog(ERROR, "could not open relation with OID %u", relid);
2234 :
2235 30 : if (!RelationIsLogicallyLogged(relation))
2236 0 : continue;
2237 :
2238 30 : relations[nrelations++] = relation;
2239 : }
2240 :
2241 : /* Apply the truncate. */
2242 20 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2243 : relations, change,
2244 : streaming);
2245 :
2246 50 : for (i = 0; i < nrelations; i++)
2247 30 : RelationClose(relations[i]);
2248 :
2249 20 : break;
2250 : }
2251 :
2252 : case REORDER_BUFFER_CHANGE_MESSAGE:
2253 12 : ReorderBufferApplyMessage(rb, txn, change, streaming);
2254 12 : break;
2255 :
2256 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2257 : /* Execute the invalidation messages locally */
2258 2164 : ReorderBufferExecuteInvalidations(
2259 : change->data.inval.ninvalidations,
2260 : change->data.inval.invalidations);
2261 2164 : break;
2262 :
2263 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2264 : /* get rid of the old */
2265 448 : TeardownHistoricSnapshot(false);
2266 :
2267 448 : if (snapshot_now->copied)
2268 : {
2269 412 : ReorderBufferFreeSnap(rb, snapshot_now);
2270 412 : snapshot_now =
2271 412 : ReorderBufferCopySnap(rb, change->data.snapshot,
2272 : txn, command_id);
2273 : }
2274 :
2275 : /*
2276 : * Restored from disk, need to be careful not to double
2277 : * free. We could introduce refcounting for that, but for
2278 : * now this seems infrequent enough not to care.
2279 : */
2280 36 : else if (change->data.snapshot->copied)
2281 : {
2282 0 : snapshot_now =
2283 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2284 : txn, command_id);
2285 : }
2286 : else
2287 : {
2288 36 : snapshot_now = change->data.snapshot;
2289 : }
2290 :
2291 : /* and continue with the new one */
2292 448 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2293 448 : break;
2294 :
2295 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
2296 11758 : Assert(change->data.command_id != InvalidCommandId);
2297 :
2298 11758 : if (command_id < change->data.command_id)
2299 : {
2300 1752 : command_id = change->data.command_id;
2301 :
2302 1752 : if (!snapshot_now->copied)
2303 : {
2304 : /* we don't use the global one anymore */
2305 410 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2306 : txn, command_id);
2307 : }
2308 :
2309 1752 : snapshot_now->curcid = command_id;
2310 :
2311 1752 : TeardownHistoricSnapshot(false);
2312 1752 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2313 : }
2314 :
2315 11758 : break;
2316 :
2317 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2318 0 : elog(ERROR, "tuplecid value in changequeue");
2319 : break;
2320 : }
2321 : }
2322 :
2323 : /*
2324 : * There's a speculative insertion remaining, just clean in up, it
2325 : * can't have been successful, otherwise we'd gotten a confirmation
2326 : * record.
2327 : */
2328 1878 : if (specinsert)
2329 : {
2330 0 : ReorderBufferReturnChange(rb, specinsert, true);
2331 0 : specinsert = NULL;
2332 : }
2333 :
2334 : /* clean up the iterator */
2335 1878 : ReorderBufferIterTXNFinish(rb, iterstate);
2336 1878 : iterstate = NULL;
2337 :
2338 : /*
2339 : * Done with current changes, send the last message for this set of
2340 : * changes depending upon streaming mode.
2341 : */
2342 1878 : if (streaming)
2343 : {
2344 804 : if (stream_started)
2345 : {
2346 796 : rb->stream_stop(rb, txn, prev_lsn);
2347 796 : stream_started = false;
2348 : }
2349 : }
2350 : else
2351 : {
2352 : /*
2353 : * Call either PREPARE (for two-phase transactions) or COMMIT
2354 : * (for regular ones).
2355 : */
2356 1074 : if (rbtxn_prepared(txn))
2357 26 : rb->prepare(rb, txn, commit_lsn);
2358 : else
2359 1048 : rb->commit(rb, txn, commit_lsn);
2360 : }
2361 :
2362 : /* this is just a sanity check against bad output plugin behaviour */
2363 1878 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
2364 0 : elog(ERROR, "output plugin used XID %u",
2365 : GetCurrentTransactionId());
2366 :
2367 : /*
2368 : * Remember the command ID and snapshot for the next set of changes in
2369 : * streaming mode.
2370 : */
2371 1878 : if (streaming)
2372 804 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2373 1074 : else if (snapshot_now->copied)
2374 410 : ReorderBufferFreeSnap(rb, snapshot_now);
2375 :
2376 : /* cleanup */
2377 1878 : TeardownHistoricSnapshot(false);
2378 :
2379 : /*
2380 : * Aborting the current (sub-)transaction as a whole has the right
2381 : * semantics. We want all locks acquired in here to be released, not
2382 : * reassigned to the parent and we do not want any database access
2383 : * have persistent effects.
2384 : */
2385 1878 : AbortCurrentTransaction();
2386 :
2387 : /* make sure there's no cache pollution */
2388 1878 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2389 :
2390 1878 : if (using_subtxn)
2391 734 : RollbackAndReleaseCurrentSubTransaction();
2392 :
2393 : /*
2394 : * We are here due to one of the 3 scenarios:
2395 : * 1. As part of streaming in-progress transactions
2396 : * 2. Prepare of a two-phase commit
2397 : * 3. Commit of a transaction.
2398 : *
2399 : * If we are streaming the in-progress transaction then discard the
2400 : * changes that we just streamed, and mark the transactions as
2401 : * streamed (if they contained changes), set prepared flag as false.
2402 : * If part of a prepare of a two-phase commit set the prepared flag
2403 : * as true so that we can discard changes and cleanup tuplecids.
2404 : * Otherwise, remove all the
2405 : * changes and deallocate the ReorderBufferTXN.
2406 : */
2407 1878 : if (streaming)
2408 : {
2409 804 : ReorderBufferTruncateTXN(rb, txn, false);
2410 :
2411 : /* Reset the CheckXidAlive */
2412 804 : CheckXidAlive = InvalidTransactionId;
2413 : }
2414 1074 : else if (rbtxn_prepared(txn))
2415 : {
2416 26 : ReorderBufferTruncateTXN(rb, txn, true);
2417 : /* Reset the CheckXidAlive */
2418 26 : CheckXidAlive = InvalidTransactionId;
2419 : }
2420 : else
2421 1048 : ReorderBufferCleanupTXN(rb, txn);
2422 : }
2423 14 : PG_CATCH();
2424 : {
2425 14 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2426 14 : ErrorData *errdata = CopyErrorData();
2427 :
2428 : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
2429 14 : if (iterstate)
2430 14 : ReorderBufferIterTXNFinish(rb, iterstate);
2431 :
2432 14 : TeardownHistoricSnapshot(true);
2433 :
2434 : /*
2435 : * Force cache invalidation to happen outside of a valid transaction
2436 : * to prevent catalog access as we just caught an error.
2437 : */
2438 14 : AbortCurrentTransaction();
2439 :
2440 : /* make sure there's no cache pollution */
2441 14 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2442 : txn->invalidations);
2443 :
2444 14 : if (using_subtxn)
2445 6 : RollbackAndReleaseCurrentSubTransaction();
2446 :
2447 : /*
2448 : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2449 : * abort of the (sub)transaction we are streaming or preparing. We need to do the
2450 : * cleanup and return gracefully on this error, see SetupCheckXidLive.
2451 : */
2452 14 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK)
2453 : {
2454 : /*
2455 : * This error can only occur either when we are sending the data in
2456 : * streaming mode and the streaming is not finished yet or when we are
2457 : * sending the data out on a PREPARE during a two-phase commit.
2458 : * Both conditions can't be true either, it should be one of them.
2459 : */
2460 14 : Assert(streaming || rbtxn_prepared(txn));
2461 14 : Assert(stream_started || rbtxn_prepared(txn));
2462 14 : Assert(!(streaming && rbtxn_prepared(txn)));
2463 :
2464 : /* Cleanup the temporary error state. */
2465 14 : FlushErrorState();
2466 14 : FreeErrorData(errdata);
2467 14 : errdata = NULL;
2468 14 : curtxn->concurrent_abort = true;
2469 :
2470 : /*
2471 : * If streaming, reset the TXN so that it is allowed to stream remaining data.
2472 : */
2473 14 : if (streaming)
2474 : {
2475 10 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2476 : command_id, prev_lsn,
2477 : specinsert);
2478 : }
2479 : else
2480 : {
2481 4 : elog(LOG, "stopping decoding of %s (%u)",
2482 : txn->gid[0] != '\0'? txn->gid:"", txn->xid);
2483 4 : ReorderBufferTruncateTXN(rb, txn, true);
2484 : }
2485 : }
2486 : else
2487 : {
2488 0 : ReorderBufferCleanupTXN(rb, txn);
2489 0 : MemoryContextSwitchTo(ecxt);
2490 0 : PG_RE_THROW();
2491 : }
2492 : }
2493 1892 : PG_END_TRY();
2494 1892 : }
2495 :
2496 : /*
2497 : * Perform the replay of a transaction and its non-aborted subtransactions.
2498 : *
2499 : * Subtransactions previously have to be processed by
2500 : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2501 : * transaction with ReorderBufferAssignChild.
2502 : *
2503 : * This interface is called once a toplevel commit is read for both streamed
2504 : * as well as non-streamed transactions.
2505 : */
2506 : static void
2507 1132 : ReorderBufferCommitInternal(ReorderBufferTXN *txn,
2508 : ReorderBuffer *rb, TransactionId xid,
2509 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2510 : TimestampTz commit_time,
2511 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2512 : {
2513 : Snapshot snapshot_now;
2514 1132 : CommandId command_id = FirstCommandId;
2515 :
2516 1132 : txn->final_lsn = commit_lsn;
2517 1132 : txn->end_lsn = end_lsn;
2518 1132 : txn->commit_time = commit_time;
2519 1132 : txn->origin_id = origin_id;
2520 1132 : txn->origin_lsn = origin_lsn;
2521 :
2522 : /*
2523 : * If the transaction was (partially) streamed, we need to commit it in a
2524 : * 'streamed' way. That is, we first stream the remaining part of the
2525 : * transaction, and then invoke stream_commit message.
2526 : *
2527 : * Called after everything (origin ID, LSN, ...) is stored in the
2528 : * transaction to avoid passing that information directly.
2529 : */
2530 1132 : if (rbtxn_is_streamed(txn))
2531 : {
2532 52 : ReorderBufferStreamCommit(rb, txn);
2533 52 : return;
2534 : }
2535 :
2536 : /*
2537 : * If this transaction has no snapshot, it didn't make any changes to the
2538 : * database, so there's nothing to decode. Note that
2539 : * ReorderBufferCommitChild will have transferred any snapshots from
2540 : * subtransactions if there were any.
2541 : */
2542 1080 : if (txn->base_snapshot == NULL)
2543 : {
2544 2 : Assert(txn->ninvalidations == 0);
2545 2 : ReorderBufferCleanupTXN(rb, txn);
2546 2 : return;
2547 : }
2548 :
2549 1078 : snapshot_now = txn->base_snapshot;
2550 :
2551 : /* Process and send the changes to output plugin. */
2552 1078 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2553 : command_id, false);
2554 : }
2555 :
2556 : /*
2557 : * Ask output plugin whether we want to skip this PREPARE and send
2558 : * this transaction as a regular commit later.
2559 : */
2560 : bool
2561 156 : ReorderBufferPrepareNeedSkip(ReorderBuffer *rb, TransactionId xid, const char *gid)
2562 : {
2563 : ReorderBufferTXN *txn;
2564 :
2565 156 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2566 :
2567 156 : return rb->filter_prepare(rb, txn, xid, gid);
2568 : }
2569 :
2570 :
2571 : /*
2572 : * Commit a transaction.
2573 : *
2574 : * See comments for ReorderBufferCommitInternal()
2575 : */
2576 : void
2577 1086 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2578 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2579 : TimestampTz commit_time,
2580 : RepOriginId origin_id, XLogRecPtr origin_lsn)
2581 : {
2582 : ReorderBufferTXN *txn;
2583 :
2584 1086 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2585 : false);
2586 :
2587 : /* unknown transaction, nothing to replay */
2588 1086 : if (txn == NULL)
2589 1088 : return;
2590 :
2591 1084 : ReorderBufferCommitInternal(txn, rb, xid, commit_lsn, end_lsn,
2592 : commit_time, origin_id, origin_lsn);
2593 : }
2594 :
2595 : /*
2596 : * Prepare a two-phase transaction. It calls ReorderBufferCommitInternal()
2597 : * since all prepared transactions need to be decoded at PREPARE time.
2598 : */
2599 : void
2600 48 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2601 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2602 : TimestampTz commit_time,
2603 : RepOriginId origin_id, XLogRecPtr origin_lsn,
2604 : char *gid)
2605 : {
2606 : ReorderBufferTXN *txn;
2607 :
2608 48 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2609 : false);
2610 :
2611 : /* unknown transaction, nothing to replay */
2612 48 : if (txn == NULL)
2613 48 : return;
2614 :
2615 48 : txn->txn_flags |= RBTXN_PREPARE;
2616 48 : txn->gid = palloc(strlen(gid) + 1); /* trailing '\0' */
2617 48 : strcpy(txn->gid, gid);
2618 :
2619 48 : ReorderBufferCommitInternal(txn, rb, xid, commit_lsn, end_lsn,
2620 : commit_time, origin_id, origin_lsn);
2621 : }
2622 :
2623 : /*
2624 : * Check whether this transaction was sent as prepared to subscribers.
2625 : * Called while handling commit|abort prepared.
2626 : */
2627 : bool
2628 0 : ReorderBufferTxnIsPrepared(ReorderBuffer *rb, TransactionId xid,
2629 : const char *gid)
2630 : {
2631 : ReorderBufferTXN *txn;
2632 :
2633 0 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2634 : false);
2635 :
2636 : /*
2637 : * Always call the prepare filter. It's the job of the prepare filter to
2638 : * give us the *same* response for a given xid across multiple calls
2639 : * (including ones on restart)
2640 : */
2641 0 : return !(rb->filter_prepare(rb, txn, xid, gid));
2642 : }
2643 :
2644 : /*
2645 : * Send standalone xact event. This is used to handle COMMIT/ROLLBACK PREPARED.
2646 : */
2647 : void
2648 52 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
2649 : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2650 : TimestampTz commit_time,
2651 : RepOriginId origin_id, XLogRecPtr origin_lsn,
2652 : char *gid, bool is_commit)
2653 : {
2654 : ReorderBufferTXN *txn;
2655 :
2656 : /*
2657 : * The transaction may or may not exist (during restarts for example).
2658 : * Anyway, two-phase transactions do not contain any reorderbuffers. So allow
2659 : * it to be created below.
2660 : */
2661 52 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, commit_lsn,
2662 : true);
2663 :
2664 52 : txn->final_lsn = commit_lsn;
2665 52 : txn->end_lsn = end_lsn;
2666 52 : txn->commit_time = commit_time;
2667 52 : txn->origin_id = origin_id;
2668 52 : txn->origin_lsn = origin_lsn;
2669 : /* this txn is obviously prepared */
2670 52 : txn->txn_flags |= RBTXN_PREPARE;
2671 52 : txn->gid = palloc(strlen(gid) + 1); /* trailing '\0' */
2672 52 : strcpy(txn->gid, gid);
2673 :
2674 52 : if (is_commit)
2675 30 : txn->txn_flags |= RBTXN_COMMIT_PREPARED;
2676 : else
2677 22 : txn->txn_flags |= RBTXN_ROLLBACK_PREPARED;
2678 :
2679 52 : if (rbtxn_commit_prepared(txn))
2680 30 : rb->commit_prepared(rb, txn, commit_lsn);
2681 22 : else if (rbtxn_rollback_prepared(txn))
2682 22 : rb->rollback_prepared(rb, txn, commit_lsn);
2683 :
2684 :
2685 : /* cleanup: make sure there's no cache pollution */
2686 52 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
2687 : txn->invalidations);
2688 52 : ReorderBufferCleanupTXN(rb, txn);
2689 52 : }
2690 :
2691 : /*
2692 : * Abort a transaction that possibly has previous changes. Needs to be first
2693 : * called for subtransactions and then for the toplevel xid.
2694 : *
2695 : * NB: Transactions handled here have to have actively aborted (i.e. have
2696 : * produced an abort record). Implicitly aborted transactions are handled via
2697 : * ReorderBufferAbortOld(); transactions we're just not interested in, but
2698 : * which have committed are handled in ReorderBufferForget().
2699 : *
2700 : * This function purges this transaction and its contents from memory and
2701 : * disk.
2702 : */
2703 : void
2704 128 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2705 : {
2706 : ReorderBufferTXN *txn;
2707 :
2708 128 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2709 : false);
2710 :
2711 : /* unknown, nothing to remove */
2712 128 : if (txn == NULL)
2713 128 : return;
2714 :
2715 : /* For streamed transactions notify the remote node about the abort. */
2716 128 : if (rbtxn_is_streamed(txn))
2717 : {
2718 28 : rb->stream_abort(rb, txn, lsn);
2719 :
2720 : /*
2721 : * We might have decoded changes for this transaction that could load
2722 : * the cache as per the current transaction's view (consider DDL's
2723 : * happened in this transaction). We don't want the decoding of future
2724 : * transactions to use those cache entries so execute invalidations.
2725 : */
2726 28 : if (txn->ninvalidations > 0)
2727 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2728 : txn->invalidations);
2729 : }
2730 :
2731 : /* cosmetic... */
2732 128 : txn->final_lsn = lsn;
2733 :
2734 : /*
2735 : * remove potential on-disk data, and deallocate.
2736 : *
2737 : * We remove it even for prepared transactions (GID is enough to
2738 : * commit/abort those later).
2739 : */
2740 128 : ReorderBufferCleanupTXN(rb, txn);
2741 : }
2742 :
2743 : /*
2744 : * Abort all transactions that aren't actually running anymore because the
2745 : * server restarted.
2746 : *
2747 : * NB: These really have to be transactions that have aborted due to a server
2748 : * crash/immediate restart, as we don't deal with invalidations here.
2749 : */
2750 : void
2751 982 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
2752 : {
2753 : dlist_mutable_iter it;
2754 :
2755 : /*
2756 : * Iterate through all (potential) toplevel TXNs and abort all that are
2757 : * older than what possibly can be running. Once we've found the first
2758 : * that is alive we stop, there might be some that acquired an xid earlier
2759 : * but started writing later, but it's unlikely and they will be cleaned
2760 : * up in a later call to this function.
2761 : */
2762 990 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
2763 : {
2764 : ReorderBufferTXN *txn;
2765 :
2766 32 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
2767 :
2768 32 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
2769 : {
2770 8 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
2771 :
2772 : /* remove potential on-disk data, and deallocate this tx */
2773 8 : ReorderBufferCleanupTXN(rb, txn);
2774 : }
2775 : else
2776 48 : return;
2777 : }
2778 : }
2779 :
2780 : /*
2781 : * Forget the contents of a transaction if we aren't interested in its
2782 : * contents. Needs to be first called for subtransactions and then for the
2783 : * toplevel xid.
2784 : *
2785 : * This is significantly different to ReorderBufferAbort() because
2786 : * transactions that have committed need to be treated differently from aborted
2787 : * ones since they may have modified the catalog.
2788 : *
2789 : * Note that this is only allowed to be called in the moment a transaction
2790 : * commit has just been read, not earlier; otherwise later records referring
2791 : * to this xid might re-create the transaction incompletely.
2792 : */
2793 : void
2794 3962 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2795 : {
2796 : ReorderBufferTXN *txn;
2797 :
2798 3962 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2799 : false);
2800 :
2801 : /* unknown, nothing to forget */
2802 3962 : if (txn == NULL)
2803 5076 : return;
2804 :
2805 : /* For streamed transactions notify the remote node about the abort. */
2806 2848 : if (rbtxn_is_streamed(txn))
2807 0 : rb->stream_abort(rb, txn, lsn);
2808 :
2809 : /* cosmetic... */
2810 2848 : txn->final_lsn = lsn;
2811 :
2812 : /*
2813 : * Process cache invalidation messages if there are any. Even if we're not
2814 : * interested in the transaction's contents, it could have manipulated the
2815 : * catalog and we need to update the caches according to that.
2816 : */
2817 2848 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
2818 728 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
2819 : txn->invalidations);
2820 : else
2821 2120 : Assert(txn->ninvalidations == 0);
2822 :
2823 : /* remove potential on-disk data, and deallocate */
2824 2848 : ReorderBufferCleanupTXN(rb, txn);
2825 : }
2826 :
2827 : /*
2828 : * Execute invalidations happening outside the context of a decoded
2829 : * transaction. That currently happens either for xid-less commits
2830 : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
2831 : * transactions (via ReorderBufferForget()).
2832 : */
2833 : void
2834 728 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
2835 : SharedInvalidationMessage *invalidations)
2836 : {
2837 728 : bool use_subtxn = IsTransactionOrTransactionBlock();
2838 : int i;
2839 :
2840 728 : if (use_subtxn)
2841 682 : BeginInternalSubTransaction("replay");
2842 :
2843 : /*
2844 : * Force invalidations to happen outside of a valid transaction - that way
2845 : * entries will just be marked as invalid without accessing the catalog.
2846 : * That's advantageous because we don't need to setup the full state
2847 : * necessary for catalog access.
2848 : */
2849 728 : if (use_subtxn)
2850 682 : AbortCurrentTransaction();
2851 :
2852 33904 : for (i = 0; i < ninvalidations; i++)
2853 33176 : LocalExecuteInvalidationMessage(&invalidations[i]);
2854 :
2855 728 : if (use_subtxn)
2856 682 : RollbackAndReleaseCurrentSubTransaction();
2857 728 : }
2858 :
2859 : /*
2860 : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
2861 : * least once for every xid in XLogRecord->xl_xid (other places in records
2862 : * may, but do not have to be passed through here).
2863 : *
2864 : * Reorderbuffer keeps some datastructures about transactions in LSN order,
2865 : * for efficiency. To do that it has to know about when transactions are seen
2866 : * first in the WAL. As many types of records are not actually interesting for
2867 : * logical decoding, they do not necessarily pass though here.
2868 : */
2869 : void
2870 4380928 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
2871 : {
2872 : /* many records won't have an xid assigned, centralize check here */
2873 4380928 : if (xid != InvalidTransactionId)
2874 4379046 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
2875 4380928 : }
2876 :
2877 : /*
2878 : * Add a new snapshot to this transaction that may only used after lsn 'lsn'
2879 : * because the previous snapshot doesn't describe the catalog correctly for
2880 : * following rows.
2881 : */
2882 : void
2883 1204 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
2884 : XLogRecPtr lsn, Snapshot snap)
2885 : {
2886 1204 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
2887 :
2888 1204 : change->data.snapshot = snap;
2889 1204 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
2890 :
2891 1204 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
2892 1204 : }
2893 :
2894 : /*
2895 : * Set up the transaction's base snapshot.
2896 : *
2897 : * If we know that xid is a subtransaction, set the base snapshot on the
2898 : * top-level transaction instead.
2899 : */
2900 : void
2901 3230 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
2902 : XLogRecPtr lsn, Snapshot snap)
2903 : {
2904 : ReorderBufferTXN *txn;
2905 : bool is_new;
2906 :
2907 3230 : AssertArg(snap != NULL);
2908 :
2909 : /*
2910 : * Fetch the transaction to operate on. If we know it's a subtransaction,
2911 : * operate on its top-level transaction instead.
2912 : */
2913 3230 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
2914 3230 : if (rbtxn_is_known_subxact(txn))
2915 218 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
2916 : NULL, InvalidXLogRecPtr, false);
2917 3230 : Assert(txn->base_snapshot == NULL);
2918 :
2919 3230 : txn->base_snapshot = snap;
2920 3230 : txn->base_snapshot_lsn = lsn;
2921 3230 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
2922 :
2923 3230 : AssertTXNLsnOrder(rb);
2924 3230 : }
2925 :
2926 : /*
2927 : * Access the catalog with this CommandId at this point in the changestream.
2928 : *
2929 : * May only be called for command ids > 1
2930 : */
2931 : void
2932 30900 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
2933 : XLogRecPtr lsn, CommandId cid)
2934 : {
2935 30900 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
2936 :
2937 30900 : change->data.command_id = cid;
2938 30900 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
2939 :
2940 30900 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
2941 30900 : }
2942 :
2943 : /*
2944 : * Update memory counters to account for the new or removed change.
2945 : *
2946 : * We update two counters - in the reorder buffer, and in the transaction
2947 : * containing the change. The reorder buffer counter allows us to quickly
2948 : * decide if we reached the memory limit, the transaction counter allows
2949 : * us to quickly pick the largest transaction for eviction.
2950 : *
2951 : * When streaming is enabled, we need to update the toplevel transaction
2952 : * counters instead - we don't really care about subtransactions as we
2953 : * can't stream them individually anyway, and we only pick toplevel
2954 : * transactions for eviction. So only toplevel transactions matter.
2955 : */
2956 : static void
2957 6659048 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
2958 : ReorderBufferChange *change,
2959 : bool addition)
2960 : {
2961 : Size sz;
2962 : ReorderBufferTXN *txn;
2963 6659048 : ReorderBufferTXN *toptxn = NULL;
2964 :
2965 6659048 : Assert(change->txn);
2966 :
2967 : /*
2968 : * Ignore tuple CID changes, because those are not evicted when reaching
2969 : * memory limit. So we just don't count them, because it might easily
2970 : * trigger a pointless attempt to spill.
2971 : */
2972 6659048 : if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
2973 6689840 : return;
2974 :
2975 6628256 : txn = change->txn;
2976 :
2977 : /* If streaming supported, update the total size in top level as well. */
2978 6628256 : if (ReorderBufferCanStream(rb))
2979 : {
2980 1126702 : if (txn->toptxn != NULL)
2981 57920 : toptxn = txn->toptxn;
2982 : else
2983 1068782 : toptxn = txn;
2984 : }
2985 :
2986 6628256 : sz = ReorderBufferChangeSize(change);
2987 :
2988 6628256 : if (addition)
2989 : {
2990 3315582 : txn->size += sz;
2991 3315582 : rb->size += sz;
2992 :
2993 : /* Update the total size in the top transaction. */
2994 3315582 : if (toptxn)
2995 564766 : toptxn->total_size += sz;
2996 : }
2997 : else
2998 : {
2999 3312674 : Assert((rb->size >= sz) && (txn->size >= sz));
3000 3312674 : txn->size -= sz;
3001 3312674 : rb->size -= sz;
3002 :
3003 : /* Update the total size in the top transaction. */
3004 3312674 : if (toptxn)
3005 561936 : toptxn->total_size -= sz;
3006 : }
3007 :
3008 6628256 : Assert(txn->size <= rb->size);
3009 : }
3010 :
3011 : /*
3012 : * Add new (relfilenode, tid) -> (cmin, cmax) mappings.
3013 : *
3014 : * We do not include this change type in memory accounting, because we
3015 : * keep CIDs in a separate list and do not evict them when reaching
3016 : * the memory limit.
3017 : */
3018 : void
3019 30900 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3020 : XLogRecPtr lsn, RelFileNode node,
3021 : ItemPointerData tid, CommandId cmin,
3022 : CommandId cmax, CommandId combocid)
3023 : {
3024 30900 : ReorderBufferChange *change = ReorderBufferGetChange(rb);
3025 : ReorderBufferTXN *txn;
3026 :
3027 30900 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3028 :
3029 30900 : change->data.tuplecid.node = node;
3030 30900 : change->data.tuplecid.tid = tid;
3031 30900 : change->data.tuplecid.cmin = cmin;
3032 30900 : change->data.tuplecid.cmax = cmax;
3033 30900 : change->data.tuplecid.combocid = combocid;
3034 30900 : change->lsn = lsn;
3035 30900 : change->txn = txn;
3036 30900 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3037 :
3038 30900 : dlist_push_tail(&txn->tuplecids, &change->node);
3039 30900 : txn->ntuplecids++;
3040 30900 : }
3041 :
3042 : /*
3043 : * Setup the invalidation of the toplevel transaction.
3044 : *
3045 : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3046 : * accumulates all the invalidation messages in the toplevel transaction as
3047 : * well as in the form of change in reorder buffer. We require to record it in
3048 : * form of the change so that we can execute only the required invalidations
3049 : * instead of executing all the invalidations on each CommandId increment. We
3050 : * also need to accumulate these in the toplevel transaction because in some
3051 : * cases we skip processing the transaction (see ReorderBufferForget), we need
3052 : * to execute all the invalidations together.
3053 : */
3054 : void
3055 5780 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3056 : XLogRecPtr lsn, Size nmsgs,
3057 : SharedInvalidationMessage *msgs)
3058 : {
3059 : ReorderBufferTXN *txn;
3060 : MemoryContext oldcontext;
3061 : ReorderBufferChange *change;
3062 :
3063 5780 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3064 :
3065 5780 : oldcontext = MemoryContextSwitchTo(rb->context);
3066 :
3067 : /*
3068 : * Collect all the invalidations under the top transaction so that we can
3069 : * execute them all together. See comment atop this function
3070 : */
3071 5780 : if (txn->toptxn)
3072 260 : txn = txn->toptxn;
3073 :
3074 5780 : Assert(nmsgs > 0);
3075 :
3076 : /* Accumulate invalidations. */
3077 5780 : if (txn->ninvalidations == 0)
3078 : {
3079 1174 : txn->ninvalidations = nmsgs;
3080 1174 : txn->invalidations = (SharedInvalidationMessage *)
3081 1174 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3082 1174 : memcpy(txn->invalidations, msgs,
3083 : sizeof(SharedInvalidationMessage) * nmsgs);
3084 : }
3085 : else
3086 : {
3087 4606 : txn->invalidations = (SharedInvalidationMessage *)
3088 4606 : repalloc(txn->invalidations, sizeof(SharedInvalidationMessage) *
3089 4606 : (txn->ninvalidations + nmsgs));
3090 :
3091 4606 : memcpy(txn->invalidations + txn->ninvalidations, msgs,
3092 : nmsgs * sizeof(SharedInvalidationMessage));
3093 4606 : txn->ninvalidations += nmsgs;
3094 : }
3095 :
3096 5780 : change = ReorderBufferGetChange(rb);
3097 5780 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3098 5780 : change->data.inval.ninvalidations = nmsgs;
3099 5780 : change->data.inval.invalidations = (SharedInvalidationMessage *)
3100 5780 : palloc(sizeof(SharedInvalidationMessage) * nmsgs);
3101 5780 : memcpy(change->data.inval.invalidations, msgs,
3102 : sizeof(SharedInvalidationMessage) * nmsgs);
3103 :
3104 5780 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3105 :
3106 5780 : MemoryContextSwitchTo(oldcontext);
3107 5780 : }
3108 :
3109 : /*
3110 : * Apply all invalidations we know. Possibly we only need parts at this point
3111 : * in the changestream but we don't know which those are.
3112 : */
3113 : static void
3114 4108 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3115 : {
3116 : int i;
3117 :
3118 49386 : for (i = 0; i < nmsgs; i++)
3119 45278 : LocalExecuteInvalidationMessage(&msgs[i]);
3120 4108 : }
3121 :
3122 : /*
3123 : * Mark a transaction as containing catalog changes
3124 : */
3125 : void
3126 38146 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3127 : XLogRecPtr lsn)
3128 : {
3129 : ReorderBufferTXN *txn;
3130 :
3131 38146 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3132 :
3133 38146 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3134 :
3135 : /*
3136 : * Mark top-level transaction as having catalog changes too if one of its
3137 : * children has so that the ReorderBufferBuildTupleCidHash can
3138 : * conveniently check just top-level transaction and decide whether to
3139 : * build the hash table or not.
3140 : */
3141 38146 : if (txn->toptxn != NULL)
3142 1622 : txn->toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3143 38146 : }
3144 :
3145 : /*
3146 : * Query whether a transaction is already *known* to contain catalog
3147 : * changes. This can be wrong until directly before the commit!
3148 : */
3149 : bool
3150 5588 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3151 : {
3152 : ReorderBufferTXN *txn;
3153 :
3154 5588 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3155 : false);
3156 5588 : if (txn == NULL)
3157 1278 : return false;
3158 :
3159 4310 : return rbtxn_has_catalog_changes(txn);
3160 : }
3161 :
3162 : /*
3163 : * ReorderBufferXidHasBaseSnapshot
3164 : * Have we already set the base snapshot for the given txn/subtxn?
3165 : */
3166 : bool
3167 2993236 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3168 : {
3169 : ReorderBufferTXN *txn;
3170 :
3171 2993236 : txn = ReorderBufferTXNByXid(rb, xid, false,
3172 : NULL, InvalidXLogRecPtr, false);
3173 :
3174 : /* transaction isn't known yet, ergo no snapshot */
3175 2993236 : if (txn == NULL)
3176 0 : return false;
3177 :
3178 : /* a known subtxn? operate on top-level txn instead */
3179 2993236 : if (rbtxn_is_known_subxact(txn))
3180 992040 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3181 : NULL, InvalidXLogRecPtr, false);
3182 :
3183 2993236 : return txn->base_snapshot != NULL;
3184 : }
3185 :
3186 :
3187 : /*
3188 : * ---------------------------------------
3189 : * Disk serialization support
3190 : * ---------------------------------------
3191 : */
3192 :
3193 : /*
3194 : * Ensure the IO buffer is >= sz.
3195 : */
3196 : static void
3197 5681972 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3198 : {
3199 5681972 : if (!rb->outbufsize)
3200 : {
3201 76 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3202 76 : rb->outbufsize = sz;
3203 : }
3204 5681896 : else if (rb->outbufsize < sz)
3205 : {
3206 560 : rb->outbuf = repalloc(rb->outbuf, sz);
3207 560 : rb->outbufsize = sz;
3208 : }
3209 5681972 : }
3210 :
3211 : /*
3212 : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3213 : *
3214 : * XXX With many subtransactions this might be quite slow, because we'll have
3215 : * to walk through all of them. There are some options how we could improve
3216 : * that: (a) maintain some secondary structure with transactions sorted by
3217 : * amount of changes, (b) not looking for the entirely largest transaction,
3218 : * but e.g. for transaction using at least some fraction of the memory limit,
3219 : * and (c) evicting multiple transactions at once, e.g. to free a given portion
3220 : * of the memory limit (e.g. 50%).
3221 : */
3222 : static ReorderBufferTXN *
3223 5914 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3224 : {
3225 : HASH_SEQ_STATUS hash_seq;
3226 : ReorderBufferTXNByIdEnt *ent;
3227 5914 : ReorderBufferTXN *largest = NULL;
3228 :
3229 5914 : hash_seq_init(&hash_seq, rb->by_txn);
3230 21242 : while ((ent = hash_seq_search(&hash_seq)) != NULL)
3231 : {
3232 9414 : ReorderBufferTXN *txn = ent->txn;
3233 :
3234 : /* if the current transaction is larger, remember it */
3235 9414 : if ((!largest) || (txn->size > largest->size))
3236 7484 : largest = txn;
3237 : }
3238 :
3239 5914 : Assert(largest);
3240 5914 : Assert(largest->size > 0);
3241 5914 : Assert(largest->size <= rb->size);
3242 :
3243 5914 : return largest;
3244 : }
3245 :
3246 : /*
3247 : * Find the largest toplevel transaction to evict (by streaming).
3248 : *
3249 : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3250 : * should give us the same transaction (because we don't update memory account
3251 : * for subtransaction with streaming, so it's always 0). But we can simply
3252 : * iterate over the limited number of toplevel transactions.
3253 : *
3254 : * Note that, we skip transactions that contains incomplete changes. There
3255 : * is a scope of optimization here such that we can select the largest transaction
3256 : * which has complete changes. But that will make the code and design quite complex
3257 : * and that might not be worth the benefit. If we plan to stream the transactions
3258 : * that contains incomplete changes then we need to find a way to partially
3259 : * stream/truncate the transaction changes in-memory and build a mechanism to
3260 : * partially truncate the spilled files. Additionally, whenever we partially
3261 : * stream the transaction we need to maintain the last streamed lsn and next time
3262 : * we need to restore from that segment and the offset in WAL. As we stream the
3263 : * changes from the top transaction and restore them subtransaction wise, we need
3264 : * to even remember the subxact from where we streamed the last change.
3265 : */
3266 : static ReorderBufferTXN *
3267 850 : ReorderBufferLargestTopTXN(ReorderBuffer *rb)
3268 : {
3269 : dlist_iter iter;
3270 850 : Size largest_size = 0;
3271 850 : ReorderBufferTXN *largest = NULL;
3272 :
3273 : /* Find the largest top-level transaction. */
3274 1786 : dlist_foreach(iter, &rb->toplevel_by_lsn)
3275 : {
3276 : ReorderBufferTXN *txn;
3277 :
3278 936 : txn = dlist_container(ReorderBufferTXN, node, iter.cur);
3279 :
3280 1788 : if ((largest != NULL || txn->total_size > largest_size) &&
3281 1704 : (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn)))
3282 : {
3283 764 : largest = txn;
3284 764 : largest_size = txn->total_size;
3285 : }
3286 : }
3287 :
3288 850 : return largest;
3289 : }
3290 :
3291 : /*
3292 : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3293 : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3294 : * disk until we reach under the memory limit.
3295 : *
3296 : * XXX At this point we select the transactions until we reach under the memory
3297 : * limit, but we might also adapt a more elaborate eviction strategy - for example
3298 : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3299 : * limit.
3300 : */
3301 : static void
3302 2998210 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3303 : {
3304 : ReorderBufferTXN *txn;
3305 :
3306 : /* bail out if we haven't exceeded the memory limit */
3307 2998210 : if (rb->size < logical_decoding_work_mem * 1024L)
3308 5989736 : return;
3309 :
3310 : /*
3311 : * Loop until we reach under the memory limit. One might think that just
3312 : * by evicting the largest (sub)transaction we will come under the memory
3313 : * limit based on assumption that the selected transaction is at least as
3314 : * large as the most recent change (which caused us to go over the memory
3315 : * limit). However, that is not true because a user can reduce the
3316 : * logical_decoding_work_mem to a smaller value before the most recent
3317 : * change.
3318 : */
3319 20028 : while (rb->size >= logical_decoding_work_mem * 1024L)
3320 : {
3321 : /*
3322 : * Pick the largest transaction (or subtransaction) and evict it from
3323 : * memory by streaming, if possible. Otherwise, spill to disk.
3324 : */
3325 6678 : if (ReorderBufferCanStartStreaming(rb) &&
3326 : (txn = ReorderBufferLargestTopTXN(rb)) != NULL)
3327 : {
3328 : /* we know there has to be one, because the size is not zero */
3329 764 : Assert(txn && !txn->toptxn);
3330 764 : Assert(txn->total_size > 0);
3331 764 : Assert(rb->size >= txn->total_size);
3332 :
3333 764 : ReorderBufferStreamTXN(rb, txn);
3334 : }
3335 : else
3336 : {
3337 : /*
3338 : * Pick the largest transaction (or subtransaction) and evict it
3339 : * from memory by serializing it to disk.
3340 : */
3341 5914 : txn = ReorderBufferLargestTXN(rb);
3342 :
3343 : /* we know there has to be one, because the size is not zero */
3344 5914 : Assert(txn);
3345 5914 : Assert(txn->size > 0);
3346 5914 : Assert(rb->size >= txn->size);
3347 :
3348 5914 : ReorderBufferSerializeTXN(rb, txn);
3349 : }
3350 :
3351 : /*
3352 : * After eviction, the transaction should have no entries in memory,
3353 : * and should use 0 bytes for changes.
3354 : */
3355 6672 : Assert(txn->size == 0);
3356 6672 : Assert(txn->nentries_mem == 0);
3357 : }
3358 :
3359 : /* We must be under the memory limit now. */
3360 6672 : Assert(rb->size < logical_decoding_work_mem * 1024L);
3361 : }
3362 :
3363 : /*
3364 : * Spill data of a large transaction (and its subtransactions) to disk.
3365 : */
3366 : static void
3367 6516 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3368 : {
3369 : dlist_iter subtxn_i;
3370 : dlist_mutable_iter change_i;
3371 6516 : int fd = -1;
3372 6516 : XLogSegNo curOpenSegNo = 0;
3373 6516 : Size spilled = 0;
3374 6516 : Size size = txn->size;
3375 :
3376 6516 : elog(DEBUG2, "spill %u changes in XID %u to disk",
3377 : (uint32) txn->nentries_mem, txn->xid);
3378 :
3379 : /* do the same to all child TXs */
3380 7052 : dlist_foreach(subtxn_i, &txn->subtxns)
3381 : {
3382 : ReorderBufferTXN *subtxn;
3383 :
3384 536 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
3385 536 : ReorderBufferSerializeTXN(rb, subtxn);
3386 : }
3387 :
3388 : /* serialize changestream */
3389 2547672 : dlist_foreach_modify(change_i, &txn->changes)
3390 : {
3391 : ReorderBufferChange *change;
3392 :
3393 2541156 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
3394 :
3395 : /*
3396 : * store in segment in which it belongs by start lsn, don't split over
3397 : * multiple segments tho
3398 : */
3399 5076298 : if (fd == -1 ||
3400 2535142 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
3401 : {
3402 : char path[MAXPGPATH];
3403 :
3404 6014 : if (fd != -1)
3405 0 : CloseTransientFile(fd);
3406 :
3407 6014 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
3408 :
3409 : /*
3410 : * No need to care about TLIs here, only used during a single run,
3411 : * so each LSN only maps to a specific WAL record.
3412 : */
3413 6014 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3414 : curOpenSegNo);
3415 :
3416 : /* open segment, create it if necessary */
3417 6014 : fd = OpenTransientFile(path,
3418 : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
3419 :
3420 6014 : if (fd < 0)
3421 0 : ereport(ERROR,
3422 : (errcode_for_file_access(),
3423 : errmsg("could not open file \"%s\": %m", path)));
3424 : }
3425 :
3426 2541156 : ReorderBufferSerializeChange(rb, txn, fd, change);
3427 2541156 : dlist_delete(&change->node);
3428 2541156 : ReorderBufferReturnChange(rb, change, true);
3429 :
3430 2541156 : spilled++;
3431 : }
3432 :
3433 : /* update the statistics iff we have spilled anything */
3434 6516 : if (spilled)
3435 : {
3436 6014 : rb->spillCount += 1;
3437 6014 : rb->spillBytes += size;
3438 :
3439 : /* don't consider already serialized transactions */
3440 6014 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
3441 : }
3442 :
3443 6516 : Assert(spilled == txn->nentries_mem);
3444 6516 : Assert(dlist_is_empty(&txn->changes));
3445 6516 : txn->nentries_mem = 0;
3446 6516 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
3447 :
3448 6516 : if (fd != -1)
3449 6014 : CloseTransientFile(fd);
3450 6516 : }
3451 :
3452 : /*
3453 : * Serialize individual change to disk.
3454 : */
3455 : static void
3456 2541156 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
3457 : int fd, ReorderBufferChange *change)
3458 : {
3459 : ReorderBufferDiskChange *ondisk;
3460 2541156 : Size sz = sizeof(ReorderBufferDiskChange);
3461 :
3462 2541156 : ReorderBufferSerializeReserve(rb, sz);
3463 :
3464 2541156 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3465 2541156 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
3466 :
3467 2541156 : switch (change->action)
3468 : {
3469 : /* fall through these, they're all similar enough */
3470 : case REORDER_BUFFER_CHANGE_INSERT:
3471 : case REORDER_BUFFER_CHANGE_UPDATE:
3472 : case REORDER_BUFFER_CHANGE_DELETE:
3473 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3474 : {
3475 : char *data;
3476 : ReorderBufferTupleBuf *oldtup,
3477 : *newtup;
3478 2506692 : Size oldlen = 0;
3479 2506692 : Size newlen = 0;
3480 :
3481 2506692 : oldtup = change->data.tp.oldtuple;
3482 2506692 : newtup = change->data.tp.newtuple;
3483 :
3484 2506692 : if (oldtup)
3485 : {
3486 184414 : sz += sizeof(HeapTupleData);
3487 184414 : oldlen = oldtup->tuple.t_len;
3488 184414 : sz += oldlen;
3489 : }
3490 :
3491 2506692 : if (newtup)
3492 : {
3493 2214970 : sz += sizeof(HeapTupleData);
3494 2214970 : newlen = newtup->tuple.t_len;
3495 2214970 : sz += newlen;
3496 : }
3497 :
3498 : /* make sure we have enough space */
3499 2506692 : ReorderBufferSerializeReserve(rb, sz);
3500 :
3501 2506692 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3502 : /* might have been reallocated above */
3503 2506692 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3504 :
3505 2506692 : if (oldlen)
3506 : {
3507 184414 : memcpy(data, &oldtup->tuple, sizeof(HeapTupleData));
3508 184414 : data += sizeof(HeapTupleData);
3509 :
3510 184414 : memcpy(data, oldtup->tuple.t_data, oldlen);
3511 184414 : data += oldlen;
3512 : }
3513 :
3514 2506692 : if (newlen)
3515 : {
3516 2214970 : memcpy(data, &newtup->tuple, sizeof(HeapTupleData));
3517 2214970 : data += sizeof(HeapTupleData);
3518 :
3519 2214970 : memcpy(data, newtup->tuple.t_data, newlen);
3520 2214970 : data += newlen;
3521 : }
3522 2506692 : break;
3523 : }
3524 : case REORDER_BUFFER_CHANGE_MESSAGE:
3525 : {
3526 : char *data;
3527 28 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3528 :
3529 28 : sz += prefix_size + change->data.msg.message_size +
3530 : sizeof(Size) + sizeof(Size);
3531 28 : ReorderBufferSerializeReserve(rb, sz);
3532 :
3533 28 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3534 :
3535 : /* might have been reallocated above */
3536 28 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3537 :
3538 : /* write the prefix including the size */
3539 28 : memcpy(data, &prefix_size, sizeof(Size));
3540 28 : data += sizeof(Size);
3541 28 : memcpy(data, change->data.msg.prefix,
3542 : prefix_size);
3543 28 : data += prefix_size;
3544 :
3545 : /* write the message including the size */
3546 28 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
3547 28 : data += sizeof(Size);
3548 28 : memcpy(data, change->data.msg.message,
3549 : change->data.msg.message_size);
3550 28 : data += change->data.msg.message_size;
3551 :
3552 28 : break;
3553 : }
3554 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3555 : {
3556 : char *data;
3557 210 : Size inval_size = sizeof(SharedInvalidationMessage) *
3558 210 : change->data.inval.ninvalidations;
3559 :
3560 210 : sz += inval_size;
3561 :
3562 210 : ReorderBufferSerializeReserve(rb, sz);
3563 210 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3564 :
3565 : /* might have been reallocated above */
3566 210 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3567 210 : memcpy(data, change->data.inval.invalidations, inval_size);
3568 210 : data += inval_size;
3569 :
3570 210 : break;
3571 : }
3572 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3573 : {
3574 : Snapshot snap;
3575 : char *data;
3576 :
3577 4 : snap = change->data.snapshot;
3578 :
3579 4 : sz += sizeof(SnapshotData) +
3580 8 : sizeof(TransactionId) * snap->xcnt +
3581 4 : sizeof(TransactionId) * snap->subxcnt;
3582 :
3583 : /* make sure we have enough space */
3584 4 : ReorderBufferSerializeReserve(rb, sz);
3585 4 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3586 : /* might have been reallocated above */
3587 4 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3588 :
3589 4 : memcpy(data, snap, sizeof(SnapshotData));
3590 4 : data += sizeof(SnapshotData);
3591 :
3592 4 : if (snap->xcnt)
3593 : {
3594 4 : memcpy(data, snap->xip,
3595 4 : sizeof(TransactionId) * snap->xcnt);
3596 4 : data += sizeof(TransactionId) * snap->xcnt;
3597 : }
3598 :
3599 4 : if (snap->subxcnt)
3600 : {
3601 0 : memcpy(data, snap->subxip,
3602 0 : sizeof(TransactionId) * snap->subxcnt);
3603 0 : data += sizeof(TransactionId) * snap->subxcnt;
3604 : }
3605 4 : break;
3606 : }
3607 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3608 : {
3609 : Size size;
3610 : char *data;
3611 :
3612 : /* account for the OIDs of truncated relations */
3613 0 : size = sizeof(Oid) * change->data.truncate.nrelids;
3614 0 : sz += size;
3615 :
3616 : /* make sure we have enough space */
3617 0 : ReorderBufferSerializeReserve(rb, sz);
3618 :
3619 0 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
3620 : /* might have been reallocated above */
3621 0 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3622 :
3623 0 : memcpy(data, change->data.truncate.relids, size);
3624 0 : data += size;
3625 :
3626 0 : break;
3627 : }
3628 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3629 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3630 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3631 : /* ReorderBufferChange contains everything important */
3632 34222 : break;
3633 : }
3634 :
3635 2541156 : ondisk->size = sz;
3636 :
3637 2541156 : errno = 0;
3638 2541156 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
3639 2541156 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
3640 : {
3641 0 : int save_errno = errno;
3642 :
3643 0 : CloseTransientFile(fd);
3644 :
3645 : /* if write didn't set errno, assume problem is no disk space */
3646 0 : errno = save_errno ? save_errno : ENOSPC;
3647 0 : ereport(ERROR,
3648 : (errcode_for_file_access(),
3649 : errmsg("could not write to data file for XID %u: %m",
3650 : txn->xid)));
3651 : }
3652 2541156 : pgstat_report_wait_end();
3653 :
3654 : /*
3655 : * Keep the transaction's final_lsn up to date with each change we send to
3656 : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
3657 : * only do this on commit and abort records, but that doesn't work if a
3658 : * system crash leaves a transaction without its abort record).
3659 : *
3660 : * Make sure not to move it backwards.
3661 : */
3662 2541156 : if (txn->final_lsn < change->lsn)
3663 2531752 : txn->final_lsn = change->lsn;
3664 :
3665 2541156 : Assert(ondisk->change.action == change->action);
3666 2541156 : }
3667 :
3668 : /* Returns true, if the output plugin supports streaming, false, otherwise. */
3669 : static inline bool
3670 10195140 : ReorderBufferCanStream(ReorderBuffer *rb)
3671 : {
3672 10195140 : LogicalDecodingContext *ctx = rb->private_data;
3673 :
3674 10195140 : return ctx->streaming;
3675 : }
3676 :
3677 : /* Returns true, if the streaming can be started now, false, otherwise. */
3678 : static inline bool
3679 568674 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
3680 : {
3681 568674 : LogicalDecodingContext *ctx = rb->private_data;
3682 568674 : SnapBuild *builder = ctx->snapshot_builder;
3683 :
3684 : /*
3685 : * We can't start streaming immediately even if the streaming is enabled
3686 : * because we previously decoded this transaction and now just are
3687 : * restarting.
3688 : */
3689 1132066 : if (ReorderBufferCanStream(rb) &&
3690 563392 : !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr))
3691 : {
3692 : /* We must have a consistent snapshot by this time */
3693 327614 : Assert(SnapBuildCurrentState(builder) == SNAPBUILD_CONSISTENT);
3694 327614 : return true;
3695 : }
3696 :
3697 241060 : return false;
3698 : }
3699 :
3700 : /*
3701 : * Send data of a large transaction (and its subtransactions) to the
3702 : * output plugin, but using the stream API.
3703 : */
3704 : static void
3705 820 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3706 : {
3707 : Snapshot snapshot_now;
3708 : CommandId command_id;
3709 :
3710 : /* We can never reach here for a subtransaction. */
3711 820 : Assert(txn->toptxn == NULL);
3712 :
3713 : /*
3714 : * We can't make any assumptions about base snapshot here, similar to what
3715 : * ReorderBufferCommit() does. That relies on base_snapshot getting
3716 : * transferred from subxact in ReorderBufferCommitChild(), but that was
3717 : * not yet called as the transaction is in-progress.
3718 : *
3719 : * So just walk the subxacts and use the same logic here. But we only need
3720 : * to do that once, when the transaction is streamed for the first time.
3721 : * After that we need to reuse the snapshot from the previous run.
3722 : *
3723 : * Unlike DecodeCommit which adds xids of all the subtransactions in
3724 : * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here
3725 : * but we do add them to subxip array instead via ReorderBufferCopySnap.
3726 : * This allows the catalog changes made in subtransactions decoded till
3727 : * now to be visible.
3728 : */
3729 820 : if (txn->snapshot_now == NULL)
3730 : {
3731 : dlist_iter subxact_i;
3732 :
3733 : /* make sure this transaction is streamed for the first time */
3734 62 : Assert(!rbtxn_is_streamed(txn));
3735 :
3736 : /* at the beginning we should have invalid command ID */
3737 62 : Assert(txn->command_id == InvalidCommandId);
3738 :
3739 64 : dlist_foreach(subxact_i, &txn->subtxns)
3740 : {
3741 : ReorderBufferTXN *subtxn;
3742 :
3743 2 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
3744 2 : ReorderBufferTransferSnapToParent(txn, subtxn);
3745 : }
3746 :
3747 : /*
3748 : * If this transaction has no snapshot, it didn't make any changes to
3749 : * the database till now, so there's nothing to decode.
3750 : */
3751 62 : if (txn->base_snapshot == NULL)
3752 : {
3753 0 : Assert(txn->ninvalidations == 0);
3754 814 : return;
3755 : }
3756 :
3757 62 : command_id = FirstCommandId;
3758 62 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
3759 : txn, command_id);
3760 : }
3761 : else
3762 : {
3763 : /* the transaction must have been already streamed */
3764 758 : Assert(rbtxn_is_streamed(txn));
3765 :
3766 : /*
3767 : * Nah, we already have snapshot from the previous streaming run. We
3768 : * assume new subxacts can't move the LSN backwards, and so can't beat
3769 : * the LSN condition in the previous branch (so no need to walk
3770 : * through subxacts again). In fact, we must not do that as we may be
3771 : * using snapshot half-way through the subxact.
3772 : */
3773 758 : command_id = txn->command_id;
3774 :
3775 : /*
3776 : * We can't use txn->snapshot_now directly because after the last
3777 : * streaming run, we might have got some new sub-transactions. So we
3778 : * need to add them to the snapshot.
3779 : */
3780 758 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
3781 : txn, command_id);
3782 :
3783 : /* Free the previously copied snapshot. */
3784 758 : Assert(txn->snapshot_now->copied);
3785 758 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
3786 758 : txn->snapshot_now = NULL;
3787 : }
3788 :
3789 : /* Process and send the changes to output plugin. */
3790 820 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
3791 : command_id, true);
3792 :
3793 814 : Assert(dlist_is_empty(&txn->changes));
3794 814 : Assert(txn->nentries == 0);
3795 814 : Assert(txn->nentries_mem == 0);
3796 : }
3797 :
3798 : /*
3799 : * Size of a change in memory.
3800 : */
3801 : static Size
3802 6628256 : ReorderBufferChangeSize(ReorderBufferChange *change)
3803 : {
3804 6628256 : Size sz = sizeof(ReorderBufferChange);
3805 :
3806 6628256 : switch (change->action)
3807 : {
3808 : /* fall through these, they're all similar enough */
3809 : case REORDER_BUFFER_CHANGE_INSERT:
3810 : case REORDER_BUFFER_CHANGE_UPDATE:
3811 : case REORDER_BUFFER_CHANGE_DELETE:
3812 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3813 : {
3814 : ReorderBufferTupleBuf *oldtup,
3815 : *newtup;
3816 6473318 : Size oldlen = 0;
3817 6473318 : Size newlen = 0;
3818 :
3819 6473318 : oldtup = change->data.tp.oldtuple;
3820 6473318 : newtup = change->data.tp.newtuple;
3821 :
3822 6473318 : if (oldtup)
3823 : {
3824 547312 : sz += sizeof(HeapTupleData);
3825 547312 : oldlen = oldtup->tuple.t_len;
3826 547312 : sz += oldlen;
3827 : }
3828 :
3829 6473318 : if (newtup)
3830 : {
3831 5648958 : sz += sizeof(HeapTupleData);
3832 5648958 : newlen = newtup->tuple.t_len;
3833 5648958 : sz += newlen;
3834 : }
3835 :
3836 6473318 : break;
3837 : }
3838 : case REORDER_BUFFER_CHANGE_MESSAGE:
3839 : {
3840 104 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
3841 :
3842 104 : sz += prefix_size + change->data.msg.message_size +
3843 : sizeof(Size) + sizeof(Size);
3844 :
3845 104 : break;
3846 : }
3847 : case REORDER_BUFFER_CHANGE_INVALIDATION:
3848 : {
3849 11594 : sz += sizeof(SharedInvalidationMessage) *
3850 11594 : change->data.inval.ninvalidations;
3851 11594 : break;
3852 : }
3853 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
3854 : {
3855 : Snapshot snap;
3856 :
3857 2408 : snap = change->data.snapshot;
3858 :
3859 2408 : sz += sizeof(SnapshotData) +
3860 4816 : sizeof(TransactionId) * snap->xcnt +
3861 2408 : sizeof(TransactionId) * snap->subxcnt;
3862 :
3863 2408 : break;
3864 : }
3865 : case REORDER_BUFFER_CHANGE_TRUNCATE:
3866 : {
3867 48 : sz += sizeof(Oid) * change->data.truncate.nrelids;
3868 :
3869 48 : break;
3870 : }
3871 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
3872 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
3873 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
3874 : /* ReorderBufferChange contains everything important */
3875 140784 : break;
3876 : }
3877 :
3878 6628256 : return sz;
3879 : }
3880 :
3881 :
3882 : /*
3883 : * Restore a number of changes spilled to disk back into memory.
3884 : */
3885 : static Size
3886 182 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
3887 : TXNEntryFile *file, XLogSegNo *segno)
3888 : {
3889 182 : Size restored = 0;
3890 : XLogSegNo last_segno;
3891 : dlist_mutable_iter cleanup_iter;
3892 182 : File *fd = &file->vfd;
3893 :
3894 182 : Assert(txn->first_lsn != InvalidXLogRecPtr);
3895 182 : Assert(txn->final_lsn != InvalidXLogRecPtr);
3896 :
3897 : /* free current entries, so we have memory for more */
3898 310176 : dlist_foreach_modify(cleanup_iter, &txn->changes)
3899 : {
3900 309994 : ReorderBufferChange *cleanup =
3901 309994 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
3902 :
3903 309994 : dlist_delete(&cleanup->node);
3904 309994 : ReorderBufferReturnChange(rb, cleanup, true);
3905 : }
3906 182 : txn->nentries_mem = 0;
3907 182 : Assert(dlist_is_empty(&txn->changes));
3908 :
3909 182 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
3910 :
3911 317338 : while (restored < max_changes_in_memory && *segno <= last_segno)
3912 : {
3913 : int readBytes;
3914 : ReorderBufferDiskChange *ondisk;
3915 :
3916 316974 : if (*fd == -1)
3917 : {
3918 : char path[MAXPGPATH];
3919 :
3920 : /* first time in */
3921 66 : if (*segno == 0)
3922 66 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
3923 :
3924 66 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
3925 :
3926 : /*
3927 : * No need to care about TLIs here, only used during a single run,
3928 : * so each LSN only maps to a specific WAL record.
3929 : */
3930 66 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
3931 : *segno);
3932 :
3933 66 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
3934 :
3935 : /* No harm in resetting the offset even in case of failure */
3936 66 : file->curOffset = 0;
3937 :
3938 66 : if (*fd < 0 && errno == ENOENT)
3939 : {
3940 0 : *fd = -1;
3941 0 : (*segno)++;
3942 0 : continue;
3943 : }
3944 66 : else if (*fd < 0)
3945 0 : ereport(ERROR,
3946 : (errcode_for_file_access(),
3947 : errmsg("could not open file \"%s\": %m",
3948 : path)));
3949 : }
3950 :
3951 : /*
3952 : * Read the statically sized part of a change which has information
3953 : * about the total size. If we couldn't read a record, we're at the
3954 : * end of this file.
3955 : */
3956 316974 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
3957 316974 : readBytes = FileRead(file->vfd, rb->outbuf,
3958 : sizeof(ReorderBufferDiskChange),
3959 : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
3960 :
3961 : /* eof */
3962 316974 : if (readBytes == 0)
3963 : {
3964 66 : FileClose(*fd);
3965 66 : *fd = -1;
3966 66 : (*segno)++;
3967 66 : continue;
3968 : }
3969 316908 : else if (readBytes < 0)
3970 0 : ereport(ERROR,
3971 : (errcode_for_file_access(),
3972 : errmsg("could not read from reorderbuffer spill file: %m")));
3973 316908 : else if (readBytes != sizeof(ReorderBufferDiskChange))
3974 0 : ereport(ERROR,
3975 : (errcode_for_file_access(),
3976 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
3977 : readBytes,
3978 : (uint32) sizeof(ReorderBufferDiskChange))));
3979 :
3980 316908 : file->curOffset += readBytes;
3981 :
3982 316908 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3983 :
3984 316908 : ReorderBufferSerializeReserve(rb,
3985 316908 : sizeof(ReorderBufferDiskChange) + ondisk->size);
3986 316908 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
3987 :
3988 950724 : readBytes = FileRead(file->vfd,
3989 316908 : rb->outbuf + sizeof(ReorderBufferDiskChange),
3990 316908 : ondisk->size - sizeof(ReorderBufferDiskChange),
3991 : file->curOffset,
3992 : WAIT_EVENT_REORDER_BUFFER_READ);
3993 :
3994 316908 : if (readBytes < 0)
3995 0 : ereport(ERROR,
3996 : (errcode_for_file_access(),
3997 : errmsg("could not read from reorderbuffer spill file: %m")));
3998 316908 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
3999 0 : ereport(ERROR,
4000 : (errcode_for_file_access(),
4001 : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4002 : readBytes,
4003 : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4004 :
4005 316908 : file->curOffset += readBytes;
4006 :
4007 : /*
4008 : * ok, read a full change from disk, now restore it into proper
4009 : * in-memory format
4010 : */
4011 316908 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4012 316908 : restored++;
4013 : }
4014 :
4015 182 : return restored;
4016 : }
4017 :
4018 : /*
4019 : * Convert change from its on-disk format to in-memory format and queue it onto
4020 : * the TXN's ->changes list.
4021 : *
4022 : * Note: although "data" is declared char*, at entry it points to a
4023 : * maxalign'd buffer, making it safe in most of this function to assume
4024 : * that the pointed-to data is suitably aligned for direct access.
4025 : */
4026 : static void
4027 316908 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4028 : char *data)
4029 : {
4030 : ReorderBufferDiskChange *ondisk;
4031 : ReorderBufferChange *change;
4032 :
4033 316908 : ondisk = (ReorderBufferDiskChange *) data;
4034 :
4035 316908 : change = ReorderBufferGetChange(rb);
4036 :
4037 : /* copy static part */
4038 316908 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4039 :
4040 316908 : data += sizeof(ReorderBufferDiskChange);
4041 :
4042 : /* restore individual stuff */
4043 316908 : switch (change->action)
4044 : {
4045 : /* fall through these, they're all similar enough */
4046 : case REORDER_BUFFER_CHANGE_INSERT:
4047 : case REORDER_BUFFER_CHANGE_UPDATE:
4048 : case REORDER_BUFFER_CHANGE_DELETE:
4049 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4050 313124 : if (change->data.tp.oldtuple)
4051 : {
4052 10012 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4053 :
4054 10012 : change->data.tp.oldtuple =
4055 10012 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4056 :
4057 : /* restore ->tuple */
4058 10012 : memcpy(&change->data.tp.oldtuple->tuple, data,
4059 : sizeof(HeapTupleData));
4060 10012 : data += sizeof(HeapTupleData);
4061 :
4062 : /* reset t_data pointer into the new tuplebuf */
4063 20024 : change->data.tp.oldtuple->tuple.t_data =
4064 10012 : ReorderBufferTupleBufData(change->data.tp.oldtuple);
4065 :
4066 : /* restore tuple data itself */
4067 10012 : memcpy(change->data.tp.oldtuple->tuple.t_data, data, tuplelen);
4068 10012 : data += tuplelen;
4069 : }
4070 :
4071 313124 : if (change->data.tp.newtuple)
4072 : {
4073 : /* here, data might not be suitably aligned! */
4074 : uint32 tuplelen;
4075 :
4076 292684 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4077 : sizeof(uint32));
4078 :
4079 292684 : change->data.tp.newtuple =
4080 292684 : ReorderBufferGetTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4081 :
4082 : /* restore ->tuple */
4083 292684 : memcpy(&change->data.tp.newtuple->tuple, data,
4084 : sizeof(HeapTupleData));
4085 292684 : data += sizeof(HeapTupleData);
4086 :
4087 : /* reset t_data pointer into the new tuplebuf */
4088 585368 : change->data.tp.newtuple->tuple.t_data =
4089 292684 : ReorderBufferTupleBufData(change->data.tp.newtuple);
4090 :
4091 : /* restore tuple data itself */
4092 292684 : memcpy(change->data.tp.newtuple->tuple.t_data, data, tuplelen);
4093 292684 : data += tuplelen;
4094 : }
4095 :
4096 313124 : break;
4097 : case REORDER_BUFFER_CHANGE_MESSAGE:
4098 : {
4099 : Size prefix_size;
4100 :
4101 : /* read prefix */
4102 2 : memcpy(&prefix_size, data, sizeof(Size));
4103 2 : data += sizeof(Size);
4104 2 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4105 : prefix_size);
4106 2 : memcpy(change->data.msg.prefix, data, prefix_size);
4107 2 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
4108 2 : data += prefix_size;
4109 :
4110 : /* read the message */
4111 2 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4112 2 : data += sizeof(Size);
4113 2 : change->data.msg.message = MemoryContextAlloc(rb->context,
4114 : change->data.msg.message_size);
4115 2 : memcpy(change->data.msg.message, data,
4116 : change->data.msg.message_size);
4117 2 : data += change->data.msg.message_size;
4118 :
4119 2 : break;
4120 : }
4121 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4122 : {
4123 36 : Size inval_size = sizeof(SharedInvalidationMessage) *
4124 36 : change->data.inval.ninvalidations;
4125 :
4126 36 : change->data.inval.invalidations =
4127 36 : MemoryContextAlloc(rb->context, inval_size);
4128 :
4129 : /* read the message */
4130 36 : memcpy(change->data.inval.invalidations, data, inval_size);
4131 :
4132 36 : break;
4133 : }
4134 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4135 : {
4136 : Snapshot oldsnap;
4137 : Snapshot newsnap;
4138 : Size size;
4139 :
4140 4 : oldsnap = (Snapshot) data;
4141 :
4142 4 : size = sizeof(SnapshotData) +
4143 8 : sizeof(TransactionId) * oldsnap->xcnt +
4144 4 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4145 :
4146 4 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4147 :
4148 4 : newsnap = change->data.snapshot;
4149 :
4150 4 : memcpy(newsnap, data, size);
4151 4 : newsnap->xip = (TransactionId *)
4152 : (((char *) newsnap) + sizeof(SnapshotData));
4153 4 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4154 4 : newsnap->copied = true;
4155 4 : break;
4156 : }
4157 : /* the base struct contains all the data, easy peasy */
4158 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4159 : {
4160 : Oid *relids;
4161 :
4162 0 : relids = ReorderBufferGetRelids(rb,
4163 0 : change->data.truncate.nrelids);
4164 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4165 0 : change->data.truncate.relids = relids;
4166 :
4167 0 : break;
4168 : }
4169 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4170 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4171 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4172 3742 : break;
4173 : }
4174 :
4175 316908 : dlist_push_tail(&txn->changes, &change->node);
4176 316908 : txn->nentries_mem++;
4177 :
4178 : /*
4179 : * Update memory accounting for the restored change. We need to do this
4180 : * although we don't check the memory limit when restoring the changes in
4181 : * this branch (we only do that when initially queueing the changes after
4182 : * decoding), because we will release the changes later, and that will
4183 : * update the accounting too (subtracting the size from the counters). And
4184 : * we don't want to underflow there.
4185 : */
4186 316908 : ReorderBufferChangeMemoryUpdate(rb, change, true);
4187 316908 : }
4188 :
4189 : /*
4190 : * Remove all on-disk stored for the passed in transaction.
4191 : */
4192 : static void
4193 426 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4194 : {
4195 : XLogSegNo first;
4196 : XLogSegNo cur;
4197 : XLogSegNo last;
4198 :
4199 426 : Assert(txn->first_lsn != InvalidXLogRecPtr);
4200 426 : Assert(txn->final_lsn != InvalidXLogRecPtr);
4201 :
4202 426 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4203 426 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4204 :
4205 : /* iterate over all possible filenames, and delete them */
4206 852 : for (cur = first; cur <= last; cur++)
4207 : {
4208 : char path[MAXPGPATH];
4209 :
4210 426 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4211 426 : if (unlink(path) != 0 && errno != ENOENT)
4212 0 : ereport(ERROR,
4213 : (errcode_for_file_access(),
4214 : errmsg("could not remove file \"%s\": %m", path)));
4215 : }
4216 426 : }
4217 :
4218 : /*
4219 : * Remove any leftover serialized reorder buffers from a slot directory after a
4220 : * prior crash or decoding session exit.
4221 : */
4222 : static void
4223 1574 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4224 : {
4225 : DIR *spill_dir;
4226 : struct dirent *spill_de;
4227 : struct stat statbuf;
4228 : char path[MAXPGPATH * 2 + 12];
4229 :
4230 1574 : sprintf(path, "pg_replslot/%s", slotname);
4231 :
4232 : /* we're only handling directories here, skip if it's not ours */
4233 1574 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
4234 1574 : return;
4235 :
4236 1574 : spill_dir = AllocateDir(path);
4237 1574 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4238 : {
4239 : /* only look at names that can be ours */
4240 4722 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4241 : {
4242 0 : snprintf(path, sizeof(path),
4243 : "pg_replslot/%s/%s", slotname,
4244 0 : spill_de->d_name);
4245 :
4246 0 : if (unlink(path) != 0)
4247 0 : ereport(ERROR,
4248 : (errcode_for_file_access(),
4249 : errmsg("could not remove file \"%s\" during removal of pg_replslot/%s/xid*: %m",
4250 : path, slotname)));
4251 : }
4252 : }
4253 1574 : FreeDir(spill_dir);
4254 : }
4255 :
4256 : /*
4257 : * Given a replication slot, transaction ID and segment number, fill in the
4258 : * corresponding spill file into 'path', which is a caller-owned buffer of size
4259 : * at least MAXPGPATH.
4260 : */
4261 : static void
4262 6506 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4263 : XLogSegNo segno)
4264 : {
4265 : XLogRecPtr recptr;
4266 :
4267 6506 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4268 :
4269 19518 : snprintf(path, MAXPGPATH, "pg_replslot/%s/xid-%u-lsn-%X-%X.spill",
4270 6506 : NameStr(MyReplicationSlot->data.name),
4271 : xid,
4272 6506 : (uint32) (recptr >> 32), (uint32) recptr);
4273 6506 : }
4274 :
4275 : /*
4276 : * Delete all data spilled to disk after we've restarted/crashed. It will be
4277 : * recreated when the respective slots are reused.
4278 : */
4279 : void
4280 316 : StartupReorderBuffer(void)
4281 : {
4282 : DIR *logical_dir;
4283 : struct dirent *logical_de;
4284 :
4285 316 : logical_dir = AllocateDir("pg_replslot");
4286 1280 : while ((logical_de = ReadDir(logical_dir, "pg_replslot")) != NULL)
4287 : {
4288 980 : if (strcmp(logical_de->d_name, ".") == 0 ||
4289 332 : strcmp(logical_de->d_name, "..") == 0)
4290 632 : continue;
4291 :
4292 : /* if it cannot be a slot, skip the directory */
4293 16 : if (!ReplicationSlotValidateName(logical_de->d_name, DEBUG2))
4294 0 : continue;
4295 :
4296 : /*
4297 : * ok, has to be a surviving logical slot, iterate and delete
4298 : * everything starting with xid-*
4299 : */
4300 16 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4301 : }
4302 316 : FreeDir(logical_dir);
4303 316 : }
4304 :
4305 : /* ---------------------------------------
4306 : * toast reassembly support
4307 : * ---------------------------------------
4308 : */
4309 :
4310 : /*
4311 : * Initialize per tuple toast reconstruction support.
4312 : */
4313 : static void
4314 60 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4315 : {
4316 : HASHCTL hash_ctl;
4317 :
4318 60 : Assert(txn->toast_hash == NULL);
4319 :
4320 60 : memset(&hash_ctl, 0, sizeof(hash_ctl));
4321 60 : hash_ctl.keysize = sizeof(Oid);
4322 60 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4323 60 : hash_ctl.hcxt = rb->context;
4324 60 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4325 : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4326 60 : }
4327 :
4328 : /*
4329 : * Per toast-chunk handling for toast reconstruction
4330 : *
4331 : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4332 : * toasted Datum comes along.
4333 : */
4334 : static void
4335 3338 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4336 : Relation relation, ReorderBufferChange *change)
4337 : {
4338 : ReorderBufferToastEnt *ent;
4339 : ReorderBufferTupleBuf *newtup;
4340 : bool found;
4341 : int32 chunksize;
4342 : bool isnull;
4343 : Pointer chunk;
4344 3338 : TupleDesc desc = RelationGetDescr(relation);
4345 : Oid chunk_id;
4346 : int32 chunk_seq;
4347 :
4348 3338 : if (txn->toast_hash == NULL)
4349 60 : ReorderBufferToastInitHash(rb, txn);
4350 :
4351 3338 : Assert(IsToastRelation(relation));
4352 :
4353 3338 : newtup = change->data.tp.newtuple;
4354 3338 : chunk_id = DatumGetObjectId(fastgetattr(&newtup->tuple, 1, desc, &isnull));
4355 3338 : Assert(!isnull);
4356 3338 : chunk_seq = DatumGetInt32(fastgetattr(&newtup->tuple, 2, desc, &isnull));
4357 3338 : Assert(!isnull);
4358 :
4359 3338 : ent = (ReorderBufferToastEnt *)
4360 3338 : hash_search(txn->toast_hash,
4361 : (void *) &chunk_id,
4362 : HASH_ENTER,
4363 : &found);
4364 :
4365 3338 : if (!found)
4366 : {
4367 68 : Assert(ent->chunk_id == chunk_id);
4368 68 : ent->num_chunks = 0;
4369 68 : ent->last_chunk_seq = 0;
4370 68 : ent->size = 0;
4371 68 : ent->reconstructed = NULL;
4372 68 : dlist_init(&ent->chunks);
4373 :
4374 68 : if (chunk_seq != 0)
4375 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
4376 : chunk_seq, chunk_id);
4377 : }
4378 3270 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4379 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
4380 : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
4381 :
4382 3338 : chunk = DatumGetPointer(fastgetattr(&newtup->tuple, 3, desc, &isnull));
4383 3338 : Assert(!isnull);
4384 :
4385 : /* calculate size so we can allocate the right size at once later */
4386 3338 : if (!VARATT_IS_EXTENDED(chunk))
4387 3338 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4388 0 : else if (VARATT_IS_SHORT(chunk))
4389 : /* could happen due to heap_form_tuple doing its thing */
4390 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
4391 : else
4392 0 : elog(ERROR, "unexpected type of toast chunk");
4393 :
4394 3338 : ent->size += chunksize;
4395 3338 : ent->last_chunk_seq = chunk_seq;
4396 3338 : ent->num_chunks++;
4397 3338 : dlist_push_tail(&ent->chunks, &change->node);
4398 3338 : }
4399 :
4400 : /*
4401 : * Rejigger change->newtuple to point to in-memory toast tuples instead to
4402 : * on-disk toast tuples that may not longer exist (think DROP TABLE or VACUUM).
4403 : *
4404 : * We cannot replace unchanged toast tuples though, so those will still point
4405 : * to on-disk toast data.
4406 : *
4407 : * While updating the existing change with detoasted tuple data, we need to
4408 : * update the memory accounting info, because the change size will differ.
4409 : * Otherwise the accounting may get out of sync, triggering serialization
4410 : * at unexpected times.
4411 : *
4412 : * We simply subtract size of the change before rejiggering the tuple, and
4413 : * then adding the new size. This makes it look like the change was removed
4414 : * and then added back, except it only tweaks the accounting info.
4415 : *
4416 : * In particular it can't trigger serialization, which would be pointless
4417 : * anyway as it happens during commit processing right before handing
4418 : * the change to the output plugin.
4419 : */
4420 : static void
4421 632718 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
4422 : Relation relation, ReorderBufferChange *change)
4423 : {
4424 : TupleDesc desc;
4425 : int natt;
4426 : Datum *attrs;
4427 : bool *isnull;
4428 : bool *free;
4429 : HeapTuple tmphtup;
4430 : Relation toast_rel;
4431 : TupleDesc toast_desc;
4432 : MemoryContext oldcontext;
4433 : ReorderBufferTupleBuf *newtup;
4434 :
4435 : /* no toast tuples changed */
4436 632718 : if (txn->toast_hash == NULL)
4437 1264972 : return;
4438 :
4439 : /*
4440 : * We're going to modify the size of the change, so to make sure the
4441 : * accounting is correct we'll make it look like we're removing the change
4442 : * now (with the old size), and then re-add it at the end.
4443 : */
4444 464 : ReorderBufferChangeMemoryUpdate(rb, change, false);
4445 :
4446 464 : oldcontext = MemoryContextSwitchTo(rb->context);
4447 :
4448 : /* we should only have toast tuples in an INSERT or UPDATE */
4449 464 : Assert(change->data.tp.newtuple);
4450 :
4451 464 : desc = RelationGetDescr(relation);
4452 :
4453 464 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
4454 464 : if (!RelationIsValid(toast_rel))
4455 0 : elog(ERROR, "could not open relation with OID %u",
4456 : relation->rd_rel->reltoastrelid);
4457 :
4458 464 : toast_desc = RelationGetDescr(toast_rel);
4459 :
4460 : /* should we allocate from stack instead? */
4461 464 : attrs = palloc0(sizeof(Datum) * desc->natts);
4462 464 : isnull = palloc0(sizeof(bool) * desc->natts);
4463 464 : free = palloc0(sizeof(bool) * desc->natts);
4464 :
4465 464 : newtup = change->data.tp.newtuple;
4466 :
4467 464 : heap_deform_tuple(&newtup->tuple, desc, attrs, isnull);
4468 :
4469 1456 : for (natt = 0; natt < desc->natts; natt++)
4470 : {
4471 992 : Form_pg_attribute attr = TupleDescAttr(desc, natt);
4472 : ReorderBufferToastEnt *ent;
4473 : struct varlena *varlena;
4474 :
4475 : /* va_rawsize is the size of the original datum -- including header */
4476 : struct varatt_external toast_pointer;
4477 : struct varatt_indirect redirect_pointer;
4478 992 : struct varlena *new_datum = NULL;
4479 : struct varlena *reconstructed;
4480 : dlist_iter it;
4481 992 : Size data_done = 0;
4482 :
4483 : /* system columns aren't toasted */
4484 992 : if (attr->attnum < 0)
4485 924 : continue;
4486 :
4487 992 : if (attr->attisdropped)
4488 0 : continue;
4489 :
4490 : /* not a varlena datatype */
4491 992 : if (attr->attlen != -1)
4492 480 : continue;
4493 :
4494 : /* no data */
4495 512 : if (isnull[natt])
4496 24 : continue;
4497 :
4498 : /* ok, we know we have a toast datum */
4499 488 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
4500 :
4501 : /* no need to do anything if the tuple isn't external */
4502 488 : if (!VARATT_IS_EXTERNAL(varlena))
4503 404 : continue;
4504 :
4505 84 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
4506 :
4507 : /*
4508 : * Check whether the toast tuple changed, replace if so.
4509 : */
4510 84 : ent = (ReorderBufferToastEnt *)
4511 84 : hash_search(txn->toast_hash,
4512 : (void *) &toast_pointer.va_valueid,
4513 : HASH_FIND,
4514 : NULL);
4515 84 : if (ent == NULL)
4516 16 : continue;
4517 :
4518 68 : new_datum =
4519 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
4520 :
4521 68 : free[natt] = true;
4522 :
4523 68 : reconstructed = palloc0(toast_pointer.va_rawsize);
4524 :
4525 68 : ent->reconstructed = reconstructed;
4526 :
4527 : /* stitch toast tuple back together from its parts */
4528 3406 : dlist_foreach(it, &ent->chunks)
4529 : {
4530 : bool isnull;
4531 : ReorderBufferChange *cchange;
4532 : ReorderBufferTupleBuf *ctup;
4533 : Pointer chunk;
4534 :
4535 3338 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
4536 3338 : ctup = cchange->data.tp.newtuple;
4537 3338 : chunk = DatumGetPointer(fastgetattr(&ctup->tuple, 3, toast_desc, &isnull));
4538 :
4539 3338 : Assert(!isnull);
4540 3338 : Assert(!VARATT_IS_EXTERNAL(chunk));
4541 3338 : Assert(!VARATT_IS_SHORT(chunk));
4542 :
4543 6676 : memcpy(VARDATA(reconstructed) + data_done,
4544 3338 : VARDATA(chunk),
4545 3338 : VARSIZE(chunk) - VARHDRSZ);
4546 3338 : data_done += VARSIZE(chunk) - VARHDRSZ;
4547 : }
4548 68 : Assert(data_done == toast_pointer.va_extsize);
4549 :
4550 : /* make sure its marked as compressed or not */
4551 68 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
4552 10 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
4553 : else
4554 58 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
4555 :
4556 68 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
4557 68 : redirect_pointer.pointer = reconstructed;
4558 :
4559 68 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
4560 68 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
4561 : sizeof(redirect_pointer));
4562 :
4563 68 : attrs[natt] = PointerGetDatum(new_datum);
4564 : }
4565 :
4566 : /*
4567 : * Build tuple in separate memory & copy tuple back into the tuplebuf
4568 : * passed to the output plugin. We can't directly heap_fill_tuple() into
4569 : * the tuplebuf because attrs[] will point back into the current content.
4570 : */
4571 464 : tmphtup = heap_form_tuple(desc, attrs, isnull);
4572 464 : Assert(newtup->tuple.t_len <= MaxHeapTupleSize);
4573 464 : Assert(ReorderBufferTupleBufData(newtup) == newtup->tuple.t_data);
4574 :
4575 464 : memcpy(newtup->tuple.t_data, tmphtup->t_data, tmphtup->t_len);
4576 464 : newtup->tuple.t_len = tmphtup->t_len;
4577 :
4578 : /*
4579 : * free resources we won't further need, more persistent stuff will be
4580 : * free'd in ReorderBufferToastReset().
4581 : */
4582 464 : RelationClose(toast_rel);
4583 464 : pfree(tmphtup);
4584 1456 : for (natt = 0; natt < desc->natts; natt++)
4585 : {
4586 992 : if (free[natt])
4587 68 : pfree(DatumGetPointer(attrs[natt]));
4588 : }
4589 464 : pfree(attrs);
4590 464 : pfree(free);
4591 464 : pfree(isnull);
4592 :
4593 464 : MemoryContextSwitchTo(oldcontext);
4594 :
4595 : /* now add the change back, with the correct size */
4596 464 : ReorderBufferChangeMemoryUpdate(rb, change, true);
4597 : }
4598 :
4599 : /*
4600 : * Free all resources allocated for toast reconstruction.
4601 : */
4602 : static void
4603 632316 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
4604 : {
4605 : HASH_SEQ_STATUS hstat;
4606 : ReorderBufferToastEnt *ent;
4607 :
4608 632316 : if (txn->toast_hash == NULL)
4609 1264572 : return;
4610 :
4611 : /* sequentially walk over the hash and free everything */
4612 60 : hash_seq_init(&hstat, txn->toast_hash);
4613 188 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
4614 : {
4615 : dlist_mutable_iter it;
4616 :
4617 68 : if (ent->reconstructed != NULL)
4618 68 : pfree(ent->reconstructed);
4619 :
4620 3406 : dlist_foreach_modify(it, &ent->chunks)
4621 : {
4622 3338 : ReorderBufferChange *change =
4623 3338 : dlist_container(ReorderBufferChange, node, it.cur);
4624 :
4625 3338 : dlist_delete(&change->node);
4626 3338 : ReorderBufferReturnChange(rb, change, true);
4627 : }
4628 : }
4629 :
4630 60 : hash_destroy(txn->toast_hash);
4631 60 : txn->toast_hash = NULL;
4632 : }
4633 :
4634 :
4635 : /* ---------------------------------------
4636 : * Visibility support for logical decoding
4637 : *
4638 : *
4639 : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
4640 : * always rely on stored cmin/cmax values because of two scenarios:
4641 : *
 * * A tuple got changed multiple times during a single transaction and thus
 *	 has a combocid. Combocids are only valid for the duration of a single
 *	 transaction.
4645 : * * A tuple with a cmin but no cmax (and thus no combocid) got
4646 : * deleted/updated in another transaction than the one which created it
4647 : * which we are looking at right now. As only one of cmin, cmax or combocid
4648 : * is actually stored in the heap we don't have access to the value we
4649 : * need anymore.
4650 : *
4651 : * To resolve those problems we have a per-transaction hash of (cmin,
4652 : * cmax) tuples keyed by (relfilenode, ctid) which contains the actual
4653 : * (cmin, cmax) values. That also takes care of combocids by simply
4654 : * not caring about them at all. As we have the real cmin/cmax values
4655 : * combocids aren't interesting.
4656 : *
4657 : * As we only care about catalog tuples here the overhead of this
4658 : * hashtable should be acceptable.
4659 : *
4660 : * Heap rewrites complicate this a bit, check rewriteheap.c for
4661 : * details.
4662 : * -------------------------------------------------------------------------
4663 : */
4664 :
/*
 * struct for sorting mapping files by LSN efficiently
 *
 * One instance describes a single rewrite mapping file that has been
 * selected for application; instances are collected in a list, sorted by
 * 'lsn' and then applied one after another.
 */
typedef struct RewriteMappingFile
{
	XLogRecPtr	lsn;			/* LSN decoded from the file name; sort key */
	char		fname[MAXPGPATH];	/* file name (directory entry name only) */
} RewriteMappingFile;
4671 :
#ifdef NOT_USED
/*
 * Debugging helper: emit one DEBUG3 log line per entry of tuplecid_data,
 * showing the entry's key (relfilenode and tid) and its cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);

	for (;;)
	{
		entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		if (entry == NULL)
			break;				/* scan exhausted */

		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 entry->key.relnode.dbNode,
			 entry->key.relnode.spcNode,
			 entry->key.relnode.relNode,
			 ItemPointerGetBlockNumber(&entry->key.tid),
			 ItemPointerGetOffsetNumber(&entry->key.tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
4694 :
4695 : /*
4696 : * Apply a single mapping file to tuplecid_data.
4697 : *
4698 : * The mapping file has to have been verified to be a) committed b) for our
4699 : * transaction c) applied in LSN order.
4700 : */
4701 : static void
4702 44 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
4703 : {
4704 : char path[MAXPGPATH];
4705 : int fd;
4706 : int readBytes;
4707 : LogicalRewriteMappingData map;
4708 :
4709 44 : sprintf(path, "pg_logical/mappings/%s", fname);
4710 44 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4711 44 : if (fd < 0)
4712 0 : ereport(ERROR,
4713 : (errcode_for_file_access(),
4714 : errmsg("could not open file \"%s\": %m", path)));
4715 :
4716 : while (true)
4717 : {
4718 : ReorderBufferTupleCidKey key;
4719 : ReorderBufferTupleCidEnt *ent;
4720 : ReorderBufferTupleCidEnt *new_ent;
4721 : bool found;
4722 :
4723 : /* be careful about padding */
4724 282 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
4725 :
4726 : /* read all mappings till the end of the file */
4727 282 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
4728 282 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
4729 282 : pgstat_report_wait_end();
4730 :
4731 282 : if (readBytes < 0)
4732 0 : ereport(ERROR,
4733 : (errcode_for_file_access(),
4734 : errmsg("could not read file \"%s\": %m",
4735 : path)));
4736 282 : else if (readBytes == 0) /* EOF */
4737 44 : break;
4738 238 : else if (readBytes != sizeof(LogicalRewriteMappingData))
4739 0 : ereport(ERROR,
4740 : (errcode_for_file_access(),
4741 : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
4742 : path, readBytes,
4743 : (int32) sizeof(LogicalRewriteMappingData))));
4744 :
4745 238 : key.relnode = map.old_node;
4746 238 : ItemPointerCopy(&map.old_tid,
4747 : &key.tid);
4748 :
4749 :
4750 238 : ent = (ReorderBufferTupleCidEnt *)
4751 : hash_search(tuplecid_data,
4752 : (void *) &key,
4753 : HASH_FIND,
4754 : NULL);
4755 :
4756 : /* no existing mapping, no need to update */
4757 238 : if (!ent)
4758 0 : continue;
4759 :
4760 238 : key.relnode = map.new_node;
4761 238 : ItemPointerCopy(&map.new_tid,
4762 : &key.tid);
4763 :
4764 238 : new_ent = (ReorderBufferTupleCidEnt *)
4765 : hash_search(tuplecid_data,
4766 : (void *) &key,
4767 : HASH_ENTER,
4768 : &found);
4769 :
4770 238 : if (found)
4771 : {
4772 : /*
4773 : * Make sure the existing mapping makes sense. We sometime update
4774 : * old records that did not yet have a cmax (e.g. pg_class' own
4775 : * entry while rewriting it) during rewrites, so allow that.
4776 : */
4777 12 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
4778 12 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
4779 : }
4780 : else
4781 : {
4782 : /* update mapping */
4783 226 : new_ent->cmin = ent->cmin;
4784 226 : new_ent->cmax = ent->cmax;
4785 226 : new_ent->combocid = ent->combocid;
4786 : }
4787 238 : }
4788 :
4789 44 : if (CloseTransientFile(fd) != 0)
4790 0 : ereport(ERROR,
4791 : (errcode_for_file_access(),
4792 : errmsg("could not close file \"%s\": %m", path)));
4793 44 : }
4794 :
4795 :
4796 : /*
4797 : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
4798 : */
4799 : static bool
4800 580 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
4801 : {
4802 580 : return bsearch(&xid, xip, num,
4803 : sizeof(TransactionId), xidComparator) != NULL;
4804 : }
4805 :
4806 : /*
4807 : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
4808 : */
4809 : static int
4810 34 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
4811 : {
4812 34 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
4813 34 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
4814 :
4815 34 : if (a->lsn < b->lsn)
4816 34 : return -1;
4817 0 : else if (a->lsn > b->lsn)
4818 0 : return 1;
4819 0 : return 0;
4820 : }
4821 :
4822 : /*
4823 : * Apply any existing logical remapping files if there are any targeted at our
4824 : * transaction for relid.
4825 : */
4826 : static void
4827 10 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
4828 : {
4829 : DIR *mapping_dir;
4830 : struct dirent *mapping_de;
4831 10 : List *files = NIL;
4832 : ListCell *file;
4833 10 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
4834 :
4835 10 : mapping_dir = AllocateDir("pg_logical/mappings");
4836 930 : while ((mapping_de = ReadDir(mapping_dir, "pg_logical/mappings")) != NULL)
4837 : {
4838 : Oid f_dboid;
4839 : Oid f_relid;
4840 : TransactionId f_mapped_xid;
4841 : TransactionId f_create_xid;
4842 : XLogRecPtr f_lsn;
4843 : uint32 f_hi,
4844 : f_lo;
4845 : RewriteMappingFile *f;
4846 :
4847 1810 : if (strcmp(mapping_de->d_name, ".") == 0 ||
4848 900 : strcmp(mapping_de->d_name, "..") == 0)
4849 886 : continue;
4850 :
4851 : /* Ignore files that aren't ours */
4852 890 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4853 0 : continue;
4854 :
4855 890 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
4856 : &f_dboid, &f_relid, &f_hi, &f_lo,
4857 : &f_mapped_xid, &f_create_xid) != 6)
4858 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
4859 :
4860 890 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
4861 :
4862 : /* mapping for another database */
4863 890 : if (f_dboid != dboid)
4864 0 : continue;
4865 :
4866 : /* mapping for another relation */
4867 890 : if (f_relid != relid)
4868 90 : continue;
4869 :
4870 : /* did the creating transaction abort? */
4871 800 : if (!TransactionIdDidCommit(f_create_xid))
4872 220 : continue;
4873 :
4874 : /* not for our transaction */
4875 580 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
4876 536 : continue;
4877 :
4878 : /* ok, relevant, queue for apply */
4879 44 : f = palloc(sizeof(RewriteMappingFile));
4880 44 : f->lsn = f_lsn;
4881 44 : strcpy(f->fname, mapping_de->d_name);
4882 44 : files = lappend(files, f);
4883 : }
4884 10 : FreeDir(mapping_dir);
4885 :
4886 : /* sort files so we apply them in LSN order */
4887 10 : list_sort(files, file_sort_by_lsn);
4888 :
4889 54 : foreach(file, files)
4890 : {
4891 44 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
4892 :
4893 44 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
4894 : snapshot->subxip[0]);
4895 44 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
4896 44 : pfree(f);
4897 : }
4898 10 : }
4899 :
4900 : /*
4901 : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
4902 : * combocids.
4903 : */
4904 : bool
4905 996 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
4906 : Snapshot snapshot,
4907 : HeapTuple htup, Buffer buffer,
4908 : CommandId *cmin, CommandId *cmax)
4909 : {
4910 : ReorderBufferTupleCidKey key;
4911 : ReorderBufferTupleCidEnt *ent;
4912 : ForkNumber forkno;
4913 : BlockNumber blockno;
4914 996 : bool updated_mapping = false;
4915 :
4916 : /*
4917 : * Return unresolved if tuplecid_data is not valid. That's because when
4918 : * streaming in-progress transactions we may run into tuples with the CID
4919 : * before actually decoding them. Think e.g. about INSERT followed by
4920 : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
4921 : * INSERT. So in such cases, we assume the CID is from the future
4922 : * command.
4923 : */
4924 996 : if (tuplecid_data == NULL)
4925 18 : return false;
4926 :
4927 : /* be careful about padding */
4928 978 : memset(&key, 0, sizeof(key));
4929 :
4930 978 : Assert(!BufferIsLocal(buffer));
4931 :
4932 : /*
4933 : * get relfilenode from the buffer, no convenient way to access it other
4934 : * than that.
4935 : */
4936 978 : BufferGetTag(buffer, &key.relnode, &forkno, &blockno);
4937 :
4938 : /* tuples can only be in the main fork */
4939 978 : Assert(forkno == MAIN_FORKNUM);
4940 978 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
4941 :
4942 978 : ItemPointerCopy(&htup->t_self,
4943 : &key.tid);
4944 :
4945 : restart:
4946 988 : ent = (ReorderBufferTupleCidEnt *)
4947 : hash_search(tuplecid_data,
4948 : (void *) &key,
4949 : HASH_FIND,
4950 : NULL);
4951 :
4952 : /*
4953 : * failed to find a mapping, check whether the table was rewritten and
4954 : * apply mapping if so, but only do that once - there can be no new
4955 : * mappings while we are in here since we have to hold a lock on the
4956 : * relation.
4957 : */
4958 988 : if (ent == NULL && !updated_mapping)
4959 : {
4960 10 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
4961 : /* now check but don't update for a mapping again */
4962 10 : updated_mapping = true;
4963 10 : goto restart;
4964 : }
4965 978 : else if (ent == NULL)
4966 0 : return false;
4967 :
4968 978 : if (cmin)
4969 978 : *cmin = ent->cmin;
4970 978 : if (cmax)
4971 978 : *cmax = ent->cmax;
4972 978 : return true;
4973 : }
|