From 6b8e856c15750f89f9d559ae9f9fbd7f3f2db125 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Apr 2025 00:18:14 +0300 Subject: [PATCH v6 12/12] Add a cache to Snapshot to avoid repeated CSN lookups Cache the status of all XIDs that have been looked up in the CSN log in the SnapshotData. This avoids having to go the CSN log in the common case that the same XIDs are looked up over and over again. --- src/backend/utils/time/snapmgr.c | 111 +++++++++++++++++++++++++++++-- src/include/utils/snapshot.h | 9 +++ 2 files changed, 116 insertions(+), 4 deletions(-) diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 049c706f2cf..250ba1650e4 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -114,6 +114,35 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" +/* + * Define a radix tree implementation to cache CSN lookups in a snapshot. + * + * We need only one bit of information for each XID stored in the cache: was + * the XID running or not. However, the radix tree implementation uses 8 + * bytes for each entry (on 64-bit machines) even if the value type is smaller + * than that. To reduce memory usage, we use uint64 as the value type, and + * store multiple XIDs in each value. + * + * The 64-bit value word holds two bits for each XID: whether the XID is + * present in the cache or not, and if it's present, whether it's considered + * as in-progress by the snapshot or not. So each entry in the radix tree + * holds the status for 32 XIDs. + */ +#define RT_PREFIX inprogress_cache +#define RT_SCOPE +#define RT_DECLARE +#define RT_DEFINE +#define RT_VALUE_TYPE uint64 +#include "lib/radixtree.h" + +#define INPROGRESS_CACHE_BITS 2 +#define INPROGRESS_CACHE_XIDS_PER_WORD 32 + +#define INPROGRESS_CACHE_XID_IS_CACHED(word, slotno) \ + ((((word) & (UINT64CONST(1) << (slotno)))) != 0) + +#define INPROGRESS_CACHE_XID_IS_IN_PROGRESS(word, slotno) \ + ((((word) & (UINT64CONST(1) << ((slotno) + 1)))) != 0) /* * CurrentSnapshot points to the only snapshot taken in transaction-snapshot @@ -240,6 +269,7 @@ typedef struct SerializedSnapshotData int32 subxcnt; bool suboverflowed; bool takenDuringRecovery; + XLogRecPtr snapshotCsn; CommandId curcid; } SerializedSnapshotData; @@ -1177,6 +1207,7 @@ ExportSnapshot(MVCCSnapshotShared snapshot) appendStringInfo(&buf, "sxp:%u\n", children[i]); } appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery); + appendStringInfo(&buf, "snapshotcsn:%X/%X\n", LSN_FORMAT_ARGS(snapshot->snapshotCsn)); /* * Now write the text representation into a file. We first write to a @@ -1449,6 +1480,7 @@ ImportSnapshot(const char *idstr) } snapshot->takenDuringRecovery = parseIntFromText("rec:", &filebuf, path); + snapshot->snapshotCsn = parseIntFromText("snapshotcsn:", &filebuf, path); snapshot->refcount = 1; valid_snapshots_push_out_of_order(snapshot); @@ -1702,6 +1734,7 @@ SerializeSnapshot(MVCCSnapshot snapshot, char *start_address) serialized_snapshot.subxcnt = snapshot->shared->subxcnt; serialized_snapshot.suboverflowed = snapshot->shared->suboverflowed; serialized_snapshot.takenDuringRecovery = snapshot->shared->takenDuringRecovery; + serialized_snapshot.snapshotCsn = snapshot->shared->snapshotCsn; serialized_snapshot.curcid = snapshot->curcid; /* @@ -1776,6 +1809,9 @@ RestoreSnapshot(char *start_address) snapshot->shared->subxcnt = serialized_snapshot.subxcnt; snapshot->shared->suboverflowed = serialized_snapshot.suboverflowed; snapshot->shared->takenDuringRecovery = serialized_snapshot.takenDuringRecovery; + snapshot->shared->snapshotCsn = serialized_snapshot.snapshotCsn; + snapshot->shared->inprogress_cache = NULL; + snapshot->shared->inprogress_cache_cxt = NULL; snapshot->shared->snapXactCompletionCount = 0; snapshot->shared->refcount = 1; @@ -1889,12 +1925,62 @@ XidInMVCCSnapshot(TransactionId xid, MVCCSnapshotShared snapshot) } else { - XLogRecPtr csn = CSNLogGetCSNByXid(xid); + XLogRecPtr csn; + bool inprogress; + uint64 *cache_entry; + uint64 cache_word = 0; - if (csn != InvalidXLogRecPtr && csn <= snapshot->snapshotCsn) - return false; + /* + * Calculate the word and bit slot for the XID in the cache. We use an + * offset from xmax as the key instead of the XID directly, because + * the radix tree can compact away leading zeros and is thus more + * efficient with keys closer to 0. + */ + uint32 cache_idx = snapshot->xmax - xid; + uint64 wordno = cache_idx / INPROGRESS_CACHE_XIDS_PER_WORD; + uint64 slotno = (cache_idx % INPROGRESS_CACHE_XIDS_PER_WORD) * INPROGRESS_CACHE_BITS; + + if (snapshot->inprogress_cache) + { + cache_entry = inprogress_cache_find(snapshot->inprogress_cache, wordno); + if (cache_entry) + { + cache_word = *cache_entry; + if (INPROGRESS_CACHE_XID_IS_CACHED(cache_word, slotno)) + return INPROGRESS_CACHE_XID_IS_IN_PROGRESS(cache_word, slotno); + } + } else - return true; + { + MemoryContext save_cxt; + + save_cxt = MemoryContextSwitchTo(TopMemoryContext); + + if (snapshot->inprogress_cache_cxt == NULL) + snapshot->inprogress_cache_cxt = + AllocSetContextCreate(TopMemoryContext, + "snapshot inprogress cache context", + ALLOCSET_SMALL_SIZES); + snapshot->inprogress_cache = inprogress_cache_create(snapshot->inprogress_cache_cxt); + cache_entry = NULL; + MemoryContextSwitchTo(save_cxt); + } + + /* Not found in cache, look up the CSN */ + csn = CSNLogGetCSNByXid(xid); + inprogress = (csn == InvalidXLogRecPtr || csn > snapshot->snapshotCsn); + + /* Update the cache word, and store it back to the radix tree */ + cache_word |= UINT64CONST(1) << slotno; /* cached */ + if (inprogress) + cache_word |= UINT64CONST(1) << (slotno + 1); /* in-progress */ + + if (cache_entry) + *cache_entry = cache_word; + else + inprogress_cache_set(snapshot->inprogress_cache, wordno, &cache_word); + + return inprogress; } return false; @@ -1944,6 +2030,9 @@ AllocMVCCSnapshotShared(void) shared->snapXactCompletionCount = 0; shared->refcount = 0; + shared->snapshotCsn = InvalidXLogRecPtr; + shared->inprogress_cache = NULL; + shared->inprogress_cache_cxt = NULL; MemoryContextSwitchTo(save_cxt); @@ -1972,8 +2061,22 @@ void FreeMVCCSnapshotShared(MVCCSnapshotShared shared) { Assert(shared->refcount == 0); + + if (shared->inprogress_cache) + { + inprogress_cache_free(shared->inprogress_cache); + shared->inprogress_cache = NULL; + } + if (shared->inprogress_cache_cxt) + { + MemoryContextDelete(shared->inprogress_cache_cxt); + shared->inprogress_cache_cxt = NULL; + } + if (spareSnapshotShared == NULL) + { spareSnapshotShared = shared; + } else pfree(shared); } diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 14ff80904c8..edf5bf1ba0a 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -129,6 +129,8 @@ typedef enum MVCCSnapshotKind SNAPSHOT_REGISTERED, } MVCCSnapshotKind; +struct inprogress_cache_radix_tree; /* private to snapmgr.c */ + /* * Struct representing a normal MVCC snapshot. * @@ -194,6 +196,13 @@ typedef struct MVCCSnapshotSharedData */ XLogRecPtr snapshotCsn; + /* + * Cache of XIDs known to be running or not according to the snapshot. + * Used in snapshots taken during recovery. + */ + struct inprogress_cache_radix_tree *inprogress_cache; + MemoryContext inprogress_cache_cxt; + bool takenDuringRecovery; /* recovery-shaped snapshot? */ /* -- 2.39.5