From c76104ba85a5668cfbcb236610bc494127642102 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Tue, 31 Jan 2023 17:41:31 +0900 Subject: [PATCH v24 8/9] Update TidStore patch from v23. Incorporate the comments, update comments, and add the description of concurrency support. --- src/backend/access/common/tidstore.c | 110 +++++++++++++++------------ 1 file changed, 62 insertions(+), 48 deletions(-) diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c index 89aea71945..f656de2189 100644 --- a/src/backend/access/common/tidstore.c +++ b/src/backend/access/common/tidstore.c @@ -11,7 +11,10 @@ * to tidstore_create(). Other backends can attach to the shared TidStore by * tidstore_attach(). * - * XXX: Only one process is allowed to iterate over the TidStore at a time. + * Regarding the concurrency, it basically relies on the concurrency support in + * the radix tree, but we acquires the lock on a TidStore in some cases, for + * example, when to reset the store and when to access the number tids in the + * store (num_tids). * * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -23,7 +26,6 @@ */ #include "postgres.h" -#include "access/htup_details.h" #include "access/tidstore.h" #include "miscadmin.h" #include "port/pg_bitutils.h" @@ -87,14 +89,17 @@ #define RT_VALUE_TYPE uint64 #include "lib/radixtree.h" -/* The header object for a TidStore */ +/* The control object for a TidStore */ typedef struct TidStoreControl { - int64 num_tids; /* the number of Tids stored so far */ + /* the number of tids in the store */ + int64 num_tids; + + /* These values are never changed after creation */ size_t max_bytes; /* the maximum bytes a TidStore can use */ int max_offset; /* the maximum offset number */ + int offset_nbits; /* the number of bits required for max_offset */ bool encode_tids; /* do we use tid encoding? */ - int offset_nbits; /* the number of bits used for offset number */ int offset_key_nbits; /* the number of bits of a offset number * used for the key */ @@ -117,7 +122,7 @@ struct TidStore */ TidStoreControl *control; - /* Storage for Tids. Use either one depending on TidStoreIsShared() */ + /* Storage for Tids. Use either one depending on TidStoreIsShared() */ union { local_rt_radix_tree *local; @@ -170,24 +175,24 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) /* * Create the radix tree for the main storage. * - * Memory consumption depends on the number of Tids stored, but also on the + * Memory consumption depends on the number of stored tids, but also on the * distribution of them, how the radix tree stores, and the memory management * that backed the radix tree. The maximum bytes that a TidStore can * use is specified by the max_bytes in tidstore_create(). We want the total - * amount of memory consumption not to exceed the max_bytes. + * amount of memory consumption by a TidStore not to exceed the max_bytes. * - * In non-shared cases, the radix tree uses slab allocators for each kind of - * node class. The most memory consuming case while adding Tids associated - * with one page (i.e. during tidstore_add_tids()) is that we allocate the - * largest radix tree node in a new slab block, which is approximately 70kB. - * Therefore, we deduct 70kB from the maximum bytes. + * In local TidStore cases, the radix tree uses slab allocators for each kind + * of node class. The most memory consuming case while adding Tids associated + * with one page (i.e. during tidstore_add_tids()) is that we allocate a new + * slab block for a new radix tree node, which is approximately 70kB. Therefore, + * we deduct 70kB from the max_bytes. * * In shared cases, DSA allocates the memory segments big enough to follow * a geometric series that approximately doubles the total DSA size (see * make_new_segment() in dsa.c). We simulated the how DSA increases segment * size and the simulation revealed, the 75% threshold for the maximum bytes - * perfectly works in case where it is a power-of-2, and the 60% threshold - * works for other cases. + * perfectly works in case where the max_bytes is a power-of-2, and the 60% + * threshold works for other cases. */ if (area != NULL) { @@ -199,7 +204,7 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) dp = dsa_allocate0(area, sizeof(TidStoreControl)); ts->control = (TidStoreControl *) dsa_get_address(area, dp); - ts->control->max_bytes =(uint64) (max_bytes * ratio); + ts->control->max_bytes = (uint64) (max_bytes * ratio); ts->area = area; ts->control->magic = TIDSTORE_MAGIC; @@ -212,12 +217,16 @@ tidstore_create(size_t max_bytes, int max_offset, dsa_area *area) ts->tree.local = local_rt_create(CurrentMemoryContext); ts->control = (TidStoreControl *) palloc0(sizeof(TidStoreControl)); - ts->control->max_bytes = max_bytes - (1024 * 70); + ts->control->max_bytes = max_bytes - (70 * 1024); } ts->control->max_offset = max_offset; ts->control->offset_nbits = pg_ceil_log2_32(max_offset); + /* + * We use tid encoding if the number of bits for the offset number doesn't + * fix in a value, uint64. + */ if (ts->control->offset_nbits > TIDSTORE_VALUE_NBITS) { ts->control->encode_tids = true; @@ -311,7 +320,10 @@ tidstore_destroy(TidStore *ts) pfree(ts); } -/* Forget all collected Tids */ +/* + * Forget all collected Tids. It's similar to tidstore_destroy but we don't free + * entire TidStore but recreate only the radix tree storage. + */ void tidstore_reset(TidStore *ts) { @@ -350,15 +362,6 @@ tidstore_reset(TidStore *ts) } } -static inline void -tidstore_insert_kv(TidStore *ts, uint64 key, uint64 val) -{ - if (TidStoreIsShared(ts)) - shared_rt_set(ts->tree.shared, key, val); - else - local_rt_set(ts->tree.local, key, val); -} - /* Add Tids on a block to TidStore */ void tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, @@ -371,8 +374,6 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); - ItemPointerSetBlockNumber(&tid, blkno); - if (ts->control->encode_tids) { key_base = ((uint64) blkno) << ts->control->offset_key_nbits; @@ -383,9 +384,9 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, key_base = (uint64) blkno; nkeys = 1; } - values = palloc0(sizeof(uint64) * nkeys); + ItemPointerSetBlockNumber(&tid, blkno); for (int i = 0; i < num_offsets; i++) { uint64 key; @@ -413,7 +414,10 @@ tidstore_add_tids(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, { uint64 key = key_base + i; - tidstore_insert_kv(ts, key, values[i]); + if (TidStoreIsShared(ts)) + shared_rt_set(ts->tree.shared, key, values[i]); + else + local_rt_set(ts->tree.local, key, values[i]); } } @@ -449,8 +453,11 @@ tidstore_lookup_tid(TidStore *ts, ItemPointer tid) } /* - * Prepare to iterate through a TidStore. The caller must be certain that - * no other backend will attempt to update the TidStore during the iteration. + * Prepare to iterate through a TidStore. Since the radix tree is locked during the + * iteration, so tidstore_end_iterate() needs to called when finished. + * + * Concurrent updates during the iteration will be blocked when inserting a + * key-value to the radix tree. */ TidStoreIter * tidstore_begin_iterate(TidStore *ts) @@ -482,13 +489,14 @@ tidstore_iter_kv(TidStoreIter *iter, uint64 *key, uint64 *val) { if (TidStoreIsShared(iter->ts)) return shared_rt_iterate_next(iter->tree_iter.shared, key, val); - else - return local_rt_iterate_next(iter->tree_iter.local, key, val); + + return local_rt_iterate_next(iter->tree_iter.local, key, val); } /* - * Scan the TidStore and return a TidStoreIterResult representing Tids - * in one page. Offset numbers in the result is sorted. + * Scan the TidStore and return a pointer to TidStoreIterResult that has tids + * in one block. We return the block numbers in ascending order and the offset + * numbers in each result is also sorted in ascending order. */ TidStoreIterResult * tidstore_iterate_next(TidStoreIter *iter) @@ -502,6 +510,7 @@ tidstore_iterate_next(TidStoreIter *iter) if (BlockNumberIsValid(result->blkno)) { + /* Process the previously collected key-value */ result->num_offsets = 0; tidstore_iter_extract_tids(iter, iter->next_key, iter->next_val); } @@ -515,8 +524,8 @@ tidstore_iterate_next(TidStoreIter *iter) if (BlockNumberIsValid(result->blkno) && result->blkno != blkno) { /* - * Remember the key-value pair for the next block for the - * next iteration. + * We got a key-value pair for a different block. So return the + * collected tids, and remember the key-value for the next iteration. */ iter->next_key = key; iter->next_val = val; @@ -531,7 +540,10 @@ tidstore_iterate_next(TidStoreIter *iter) return result; } -/* Finish an iteration over TidStore */ +/* + * Finish an iteration over TidStore. This needs to be called after finishing + * or when existing an iteration. + */ void tidstore_end_iterate(TidStoreIter *iter) { @@ -544,7 +556,7 @@ tidstore_end_iterate(TidStoreIter *iter) pfree(iter); } -/* Return the number of Tids we collected so far */ +/* Return the number of tids we collected so far */ int64 tidstore_num_tids(TidStore *ts) { @@ -552,7 +564,7 @@ tidstore_num_tids(TidStore *ts) Assert(!TidStoreIsShared(ts) || ts->control->magic == TIDSTORE_MAGIC); - if (TidStoreIsShared(ts)) + if (!TidStoreIsShared(ts)) return ts->control->num_tids; LWLockAcquire(&ts->control->lock, LW_SHARED); @@ -593,9 +605,8 @@ tidstore_memory_usage(TidStore *ts) */ if (TidStoreIsShared(ts)) return sizeof(TidStore) + shared_rt_memory_usage(ts->tree.shared); - else - return sizeof(TidStore) + sizeof(TidStore) + - local_rt_memory_usage(ts->tree.local); + + return sizeof(TidStore) + sizeof(TidStore) + local_rt_memory_usage(ts->tree.local); } /* @@ -609,7 +620,7 @@ tidstore_get_handle(TidStore *ts) return ts->control->handle; } -/* Extract Tids from the given key-value pair */ +/* Extract tids from the given key-value pair */ static void tidstore_iter_extract_tids(TidStoreIter *iter, uint64 key, uint64 val) { @@ -621,7 +632,10 @@ tidstore_iter_extract_tids(TidStoreIter *iter, uint64 key, uint64 val) OffsetNumber off; if (i > iter->ts->control->max_offset) + { + Assert(!iter->ts->control->encode_tids); break; + } if ((val & (UINT64CONST(1) << i)) == 0) continue; @@ -644,8 +658,8 @@ key_get_blkno(TidStore *ts, uint64 key) { if (ts->control->encode_tids) return (BlockNumber) (key >> ts->control->offset_key_nbits); - else - return (BlockNumber) key; + + return (BlockNumber) key; } /* Encode a tid to key and offset */ -- 2.31.1