From eac9256167afc948166144820e0d884c9e89f8cc Mon Sep 17 00:00:00 2001 From: Masahiko Sawada Date: Thu, 27 Oct 2022 14:02:00 +0900 Subject: [PATCH v8 4/4] PoC: DSA support for radix tree. --- .../bench_radix_tree--1.0.sql | 2 + contrib/bench_radix_tree/bench_radix_tree.c | 12 +- src/backend/lib/radixtree.c | 683 ++++++++++++------ src/backend/utils/mmgr/dsa.c | 12 + src/include/lib/radixtree.h | 6 +- src/include/utils/dsa.h | 1 + .../expected/test_radixtree.out | 17 + .../modules/test_radixtree/test_radixtree.c | 98 ++- 8 files changed, 558 insertions(+), 273 deletions(-) diff --git a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql index 0874201d7e..cf294c01d6 100644 --- a/contrib/bench_radix_tree/bench_radix_tree--1.0.sql +++ b/contrib/bench_radix_tree/bench_radix_tree--1.0.sql @@ -7,6 +7,7 @@ create function bench_shuffle_search( minblk int4, maxblk int4, random_block bool DEFAULT false, +shared bool DEFAULT false, OUT nkeys int8, OUT rt_mem_allocated int8, OUT array_mem_allocated int8, @@ -23,6 +24,7 @@ create function bench_seq_search( minblk int4, maxblk int4, random_block bool DEFAULT false, +shared bool DEFAULT false, OUT nkeys int8, OUT rt_mem_allocated int8, OUT array_mem_allocated int8, diff --git a/contrib/bench_radix_tree/bench_radix_tree.c b/contrib/bench_radix_tree/bench_radix_tree.c index 7abb237e96..be3f7ed811 100644 --- a/contrib/bench_radix_tree/bench_radix_tree.c +++ b/contrib/bench_radix_tree/bench_radix_tree.c @@ -15,6 +15,7 @@ #include "lib/radixtree.h" #include #include "miscadmin.h" +#include "storage/lwlock.h" #include "utils/timestamp.h" PG_MODULE_MAGIC; @@ -149,7 +150,9 @@ bench_search(FunctionCallInfo fcinfo, bool shuffle) BlockNumber minblk = PG_GETARG_INT32(0); BlockNumber maxblk = PG_GETARG_INT32(1); bool random_block = PG_GETARG_BOOL(2); + bool shared = PG_GETARG_BOOL(3); radix_tree *rt = NULL; + dsa_area *dsa = NULL; uint64 ntids; uint64 key; uint64 last_key = PG_UINT64_MAX; @@ 
-171,8 +174,11 @@ bench_search(FunctionCallInfo fcinfo, bool shuffle) tids = generate_tids(minblk, maxblk, TIDS_PER_BLOCK_FOR_LOAD, &ntids, random_block); + if (shared) + dsa = dsa_create(LWLockNewTrancheId()); + /* measure the load time of the radix tree */ - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, dsa); start_time = GetCurrentTimestamp(); for (int i = 0; i < ntids; i++) { @@ -323,7 +329,7 @@ bench_load_random_int(PG_FUNCTION_ARGS) elog(ERROR, "return type must be a row type"); pg_prng_seed(&state, 0); - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, NULL); start_time = GetCurrentTimestamp(); for (uint64 i = 0; i < cnt; i++) @@ -375,7 +381,7 @@ bench_fixed_height_search(PG_FUNCTION_ARGS) if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - rt = rt_create(CurrentMemoryContext); + rt = rt_create(CurrentMemoryContext, NULL); start_time = GetCurrentTimestamp(); diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c index b239b3c615..3b06f22af5 100644 --- a/src/backend/lib/radixtree.c +++ b/src/backend/lib/radixtree.c @@ -22,6 +22,15 @@ * choose it to avoid an additional pointer traversal. It is the reason this code * currently does not support variable-length keys. * + * If a DSA area is passed to rt_create(), the radix tree is created in that + * DSA area so that multiple processes can access it simultaneously. The process + * that created the shared radix tree needs to pass both the DSA area given to + * rt_create() and the dsa_pointer of the radix tree, fetched by + * rt_get_dsa_pointer(), to other processes so that they can attach with rt_attach(). + * + * XXX: the shared radix tree is still in PoC state as it doesn't have any locking support. + * Also, it supports only single-process iteration. 
+ * + * XXX: Most functions in this file have two variants for inner nodes and leaf * nodes, therefore there are duplication codes. While this sometimes makes the * code maintenance tricky, this reduces branch prediction misses when judging @@ -59,12 +68,13 @@ #include "postgres.h" +#include "lib/radixtree.h" +#include "lib/stringinfo.h" #include "miscadmin.h" #include "port/pg_bitutils.h" #include "port/pg_lfind.h" +#include "utils/dsa.h" #include "utils/memutils.h" -#include "lib/radixtree.h" -#include "lib/stringinfo.h" /* The number of bits encoded in one tree level */ #define RT_NODE_SPAN BITS_PER_BYTE @@ -152,6 +162,17 @@ typedef struct rt_node #define NODE_HAS_FREE_SLOT(n) \ (((rt_node *) (n))->count < rt_node_kind_info[((rt_node *) (n))->kind].fanout) +/* + * rt_node_ptr is used as a pointer for rt_node. It can be either a local address + * in non-shared radix tree case (RadixTreeIsShared() is false) or a dsa_pointer in + * shared radix tree case. The inner nodes of the radix tree need to use rt_node_ptr + * to store the child rt_node pointer instead of C-pointers. A rt_node_ptr can be + * converted to a local address of rt_node by using node_ptr_get_local(). 
+ */ +typedef uintptr_t rt_node_ptr; +#define InvalidRTNodePointer ((rt_node_ptr) 0) +#define RTNodePtrIsValid(x) ((x) != InvalidRTNodePointer) + /* Base type of each node kinds for leaf and inner nodes */ typedef struct rt_node_base_4 { @@ -205,7 +226,7 @@ typedef struct rt_node_inner_4 rt_node_base_4 base; /* 4 children, for key chunks */ - rt_node *children[4]; + rt_node_ptr children[4]; } rt_node_inner_4; typedef struct rt_node_leaf_4 @@ -221,7 +242,7 @@ typedef struct rt_node_inner_32 rt_node_base_32 base; /* 32 children, for key chunks */ - rt_node *children[32]; + rt_node_ptr children[32]; } rt_node_inner_32; typedef struct rt_node_leaf_32 @@ -237,7 +258,7 @@ typedef struct rt_node_inner_128 rt_node_base_128 base; /* Slots for 128 children */ - rt_node *children[128]; + rt_node_ptr children[128]; } rt_node_inner_128; typedef struct rt_node_leaf_128 @@ -260,7 +281,7 @@ typedef struct rt_node_inner_256 rt_node_base_256 base; /* Slots for 256 children */ - rt_node *children[RT_NODE_MAX_SLOTS]; + rt_node_ptr children[RT_NODE_MAX_SLOTS]; } rt_node_inner_256; typedef struct rt_node_leaf_256 @@ -344,6 +365,11 @@ static rt_node_kind_info_elem rt_node_kind_info[RT_NODE_KIND_COUNT] = { * construct the key whenever updating the node iteration information, e.g., when * advancing the current index within the node or when moving to the next node * at the same level. + * + * XXX: Currently we allow only one process to do iteration. Therefore, rt_node_iter + * has the local pointers to nodes, rather than rt_node_ptr. + * We need either a safeguard to disallow other processes to begin the iteration + * while one process is doing or to allow multiple processes to do the iteration. 
*/ typedef struct rt_node_iter { @@ -363,37 +389,56 @@ struct rt_iter uint64 key; }; -/* A radix tree with nodes */ -struct radix_tree +/* Control information for an radix tree */ +typedef struct radix_tree_control { - MemoryContext context; + rt_node_ptr root; - rt_node *root; + /* XXX: use pg_atomic_uint64 instead */ uint64 max_val; uint64 num_keys; - MemoryContextData *inner_slabs[RT_NODE_KIND_COUNT]; - MemoryContextData *leaf_slabs[RT_NODE_KIND_COUNT]; - /* statistics */ #ifdef RT_DEBUG int32 cnt[RT_NODE_KIND_COUNT]; #endif +} radix_tree_control; + +/* A radix tree with nodes */ +struct radix_tree +{ + MemoryContext context; + + /* pointing to either local memory or DSA */ + radix_tree_control *ctl; + + /* used only when the radix tree is shared */ + dsa_area *dsa; + dsa_pointer ctl_dp; + + /* used only when the radix tree is private */ + MemoryContextData *inner_slabs[RT_NODE_KIND_COUNT]; + MemoryContextData *leaf_slabs[RT_NODE_KIND_COUNT]; }; +#define RadixTreeIsShared(rt) ((rt)->dsa != NULL) static void rt_new_root(radix_tree *tree, uint64 key); -static rt_node *rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, - bool inner); -static void rt_free_node(radix_tree *tree, rt_node *node); +static rt_node_ptr rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, + bool inner); +static rt_node_ptr rt_copy_node(radix_tree *tree, rt_node *node, int new_kind); +static void rt_free_node(radix_tree *tree, rt_node_ptr nodep); +static void rt_replace_node(radix_tree *tree, rt_node *parent, rt_node_ptr oldp, + rt_node_ptr newp, uint64 key); static void rt_extend(radix_tree *tree, uint64 key); static inline bool rt_node_search_inner(rt_node *node, uint64 key, rt_action action, - rt_node **child_p); + rt_node_ptr *childp_p); static inline bool rt_node_search_leaf(rt_node *node, uint64 key, rt_action action, uint64 *value_p); -static bool rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, - uint64 key, rt_node *child); 
-static bool rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, - uint64 key, uint64 value); +static bool rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node_ptr nodep, + rt_node *node, uint64 key, rt_node_ptr childp); +static bool rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node_ptr nodep, + rt_node *node, uint64 key, uint64 value); +static inline void rt_node_update_inner(rt_node *node, uint64 key, rt_node_ptr newchildp); static inline rt_node *rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter); static inline bool rt_node_leaf_iterate_next(rt_iter *iter, rt_node_iter *node_iter, uint64 *value_p); @@ -403,6 +448,15 @@ static inline void rt_iter_update_key(rt_iter *iter, uint8 chunk, uint8 shift); /* verification (available only with assertion) */ static void rt_verify_node(rt_node *node); +/* Get the local address of nodep */ +static inline rt_node * +node_ptr_get_local(radix_tree *tree, rt_node_ptr nodep) +{ + return RadixTreeIsShared(tree) + ? (rt_node *) dsa_get_address(tree->dsa, (dsa_pointer) nodep) + : (rt_node *) nodep; +} + /* * Return index of the first element in 'base' that equals 'key'. Return -1 * if there is no such element. 
@@ -550,10 +604,10 @@ node_32_get_insertpos(rt_node_base_32 *node, uint8 chunk) /* Shift the elements right at 'idx' by one */ static inline void -chunk_children_array_shift(uint8 *chunks, rt_node **children, int count, int idx) +chunk_children_array_shift(uint8 *chunks, rt_node_ptr *children, int count, int idx) { memmove(&(chunks[idx + 1]), &(chunks[idx]), sizeof(uint8) * (count - idx)); - memmove(&(children[idx + 1]), &(children[idx]), sizeof(rt_node *) * (count - idx)); + memmove(&(children[idx + 1]), &(children[idx]), sizeof(rt_node_ptr) * (count - idx)); } static inline void @@ -565,7 +619,7 @@ chunk_values_array_shift(uint8 *chunks, uint64 *values, int count, int idx) /* Delete the element at 'idx' */ static inline void -chunk_children_array_delete(uint8 *chunks, rt_node **children, int count, int idx) +chunk_children_array_delete(uint8 *chunks, rt_node_ptr *children, int count, int idx) { memmove(&(chunks[idx]), &(chunks[idx + 1]), sizeof(uint8) * (count - idx - 1)); memmove(&(children[idx]), &(children[idx + 1]), sizeof(rt_node *) * (count - idx - 1)); @@ -580,15 +634,15 @@ chunk_values_array_delete(uint8 *chunks, uint64 *values, int count, int idx) /* Copy both chunks and children/values arrays */ static inline void -chunk_children_array_copy(uint8 *src_chunks, rt_node **src_children, - uint8 *dst_chunks, rt_node **dst_children, int count) +chunk_children_array_copy(uint8 *src_chunks, rt_node_ptr *src_children, + uint8 *dst_chunks, rt_node_ptr *dst_children, int count) { /* For better code generation */ if (count > rt_node_kind_info[RT_NODE_KIND_4].fanout) pg_unreachable(); memcpy(dst_chunks, src_chunks, sizeof(uint8) * count); - memcpy(dst_children, src_children, sizeof(rt_node *) * count); + memcpy(dst_children, src_children, sizeof(rt_node_ptr) * count); } static inline void @@ -617,7 +671,7 @@ static inline bool node_inner_128_is_slot_used(rt_node_inner_128 *node, uint8 slot) { Assert(!NODE_IS_LEAF(node)); - return (node->children[slot] != NULL); + 
return RTNodePtrIsValid(node->children[slot]); } static inline bool @@ -627,7 +681,7 @@ node_leaf_128_is_slot_used(rt_node_leaf_128 *node, uint8 slot) return ((node->isset[RT_NODE_BITMAP_BYTE(slot)] & RT_NODE_BITMAP_BIT(slot)) != 0); } -static inline rt_node * +static inline rt_node_ptr node_inner_128_get_child(rt_node_inner_128 *node, uint8 chunk) { Assert(!NODE_IS_LEAF(node)); @@ -695,7 +749,7 @@ node_leaf_128_find_unused_slot(rt_node_leaf_128 *node, uint8 chunk) } static inline void -node_inner_128_insert(rt_node_inner_128 *node, uint8 chunk, rt_node *child) +node_inner_128_insert(rt_node_inner_128 *node, uint8 chunk, rt_node_ptr child) { int slotpos; @@ -726,10 +780,10 @@ node_leaf_128_insert(rt_node_leaf_128 *node, uint8 chunk, uint64 value) /* Update the child corresponding to 'chunk' to 'child' */ static inline void -node_inner_128_update(rt_node_inner_128 *node, uint8 chunk, rt_node *child) +node_inner_128_update(rt_node_inner_128 *node, uint8 chunk, rt_node_ptr childp) { Assert(!NODE_IS_LEAF(node)); - node->children[node->base.slot_idxs[chunk]] = child; + node->children[node->base.slot_idxs[chunk]] = childp; } static inline void @@ -746,7 +800,7 @@ static inline bool node_inner_256_is_chunk_used(rt_node_inner_256 *node, uint8 chunk) { Assert(!NODE_IS_LEAF(node)); - return (node->children[chunk] != NULL); + return RTNodePtrIsValid(node->children[chunk]); } static inline bool @@ -756,7 +810,7 @@ node_leaf_256_is_chunk_used(rt_node_leaf_256 *node, uint8 chunk) return (node->isset[RT_NODE_BITMAP_BYTE(chunk)] & RT_NODE_BITMAP_BIT(chunk)) != 0; } -static inline rt_node * +static inline rt_node_ptr node_inner_256_get_child(rt_node_inner_256 *node, uint8 chunk) { Assert(!NODE_IS_LEAF(node)); @@ -774,7 +828,7 @@ node_leaf_256_get_value(rt_node_leaf_256 *node, uint8 chunk) /* Set the child in the node-256 */ static inline void -node_inner_256_set(rt_node_inner_256 *node, uint8 chunk, rt_node *child) +node_inner_256_set(rt_node_inner_256 *node, uint8 chunk, 
rt_node_ptr child) { Assert(!NODE_IS_LEAF(node)); node->children[chunk] = child; @@ -794,7 +848,7 @@ static inline void node_inner_256_delete(rt_node_inner_256 *node, uint8 chunk) { Assert(!NODE_IS_LEAF(node)); - node->children[chunk] = NULL; + node->children[chunk] = InvalidRTNodePointer; } static inline void @@ -835,28 +889,45 @@ static void rt_new_root(radix_tree *tree, uint64 key) { int shift = key_get_shift(key); - rt_node *node; + rt_node_ptr nodep; - node = (rt_node *) rt_alloc_node(tree, RT_NODE_KIND_4, shift, 0, - shift > 0); - tree->max_val = shift_get_max_val(shift); - tree->root = node; + nodep = rt_alloc_node(tree, RT_NODE_KIND_4, shift, 0, shift > 0); + tree->ctl->max_val = shift_get_max_val(shift); + tree->ctl->root = nodep; } /* * Allocate a new node with the given node kind. */ -static rt_node * +static rt_node_ptr rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, bool inner) { rt_node *newnode; + rt_node_ptr newnodep; + + if (tree->dsa != NULL) + { + dsa_pointer dp; + + if (inner) + dp = dsa_allocate0(tree->dsa, rt_node_kind_info[kind].inner_size); + else + dp = dsa_allocate0(tree->dsa, rt_node_kind_info[kind].leaf_size); - if (inner) - newnode = (rt_node *) MemoryContextAllocZero(tree->inner_slabs[kind], - rt_node_kind_info[kind].inner_size); + newnodep = (rt_node_ptr) dp; + newnode = (rt_node *) dsa_get_address(tree->dsa, newnodep); + } else - newnode = (rt_node *) MemoryContextAllocZero(tree->leaf_slabs[kind], - rt_node_kind_info[kind].leaf_size); + { + if (inner) + newnode = (rt_node *) MemoryContextAllocZero(tree->inner_slabs[kind], + rt_node_kind_info[kind].inner_size); + else + newnode = (rt_node *) MemoryContextAllocZero(tree->leaf_slabs[kind], + rt_node_kind_info[kind].leaf_size); + + newnodep = (rt_node_ptr) newnode; + } newnode->kind = kind; newnode->shift = shift; @@ -872,69 +943,81 @@ rt_alloc_node(radix_tree *tree, int kind, uint8 shift, uint8 chunk, bool inner) #ifdef RT_DEBUG /* update the statistics */ - 
tree->cnt[kind]++; + tree->ctl->cnt[kind]++; #endif - return newnode; + return newnodep; } /* * Create a new node with 'new_kind' and the same shift, chunk, and * count of 'node'. */ -static rt_node * +static rt_node_ptr rt_copy_node(radix_tree *tree, rt_node *node, int new_kind) { rt_node *newnode; + rt_node_ptr newnodep; - newnode = rt_alloc_node(tree, new_kind, node->shift, node->chunk, - node->shift > 0); + newnodep = rt_alloc_node(tree, new_kind, node->shift, node->chunk, + node->shift > 0); + newnode = node_ptr_get_local(tree, newnodep); newnode->count = node->count; - return newnode; + return newnodep; } /* Free the given node */ static void -rt_free_node(radix_tree *tree, rt_node *node) +rt_free_node(radix_tree *tree, rt_node_ptr nodep) { /* If we're deleting the root node, make the tree empty */ - if (tree->root == node) - tree->root = NULL; + if (tree->ctl->root == nodep) + tree->ctl->root = InvalidRTNodePointer; #ifdef RT_DEBUG - /* update the statistics */ - tree->cnt[node->kind]--; - Assert(tree->cnt[node->kind] >= 0); + { + rt_node *node = node_ptr_get_local(tree, nodep); + + /* update the statistics */ + tree->ctl->cnt[node->kind]--; + Assert(tree->ctl->cnt[node->kind] >= 0); + } #endif - pfree(node); + if (RadixTreeIsShared(tree)) + dsa_free(tree->dsa, (dsa_pointer) nodep); + else + pfree((rt_node *) nodep); } /* * Replace old_child with new_child, and free the old one. 
*/ static void -rt_replace_node(radix_tree *tree, rt_node *parent, rt_node *old_child, - rt_node *new_child, uint64 key) +rt_replace_node(radix_tree *tree, rt_node *parent, rt_node_ptr oldp, + rt_node_ptr newp, uint64 key) { - Assert(old_child->chunk == new_child->chunk); - Assert(old_child->shift == new_child->shift); + rt_node *old = node_ptr_get_local(tree, oldp); - if (parent == old_child) +#ifdef USE_ASSERT_CHECKING { - /* Replace the root node with the new large node */ - tree->root = new_child; + rt_node *new = node_ptr_get_local(tree, newp); + + Assert(old->chunk == new->chunk); + Assert(old->shift == new->shift); } - else - { - bool replaced PG_USED_FOR_ASSERTS_ONLY; +#endif - replaced = rt_node_insert_inner(tree, NULL, parent, key, new_child); - Assert(replaced); + if (parent == old) + { + /* Replace the root node with the new large node */ + tree->ctl->root = newp; } + else + rt_node_update_inner(parent, key, newp); - rt_free_node(tree, old_child); + rt_free_node(tree, oldp); } /* @@ -945,7 +1028,8 @@ static void rt_extend(radix_tree *tree, uint64 key) { int target_shift; - int shift = tree->root->shift + RT_NODE_SPAN; + rt_node *root = node_ptr_get_local(tree, tree->ctl->root); + int shift = root->shift + RT_NODE_SPAN; target_shift = key_get_shift(key); @@ -953,20 +1037,77 @@ rt_extend(radix_tree *tree, uint64 key) while (shift <= target_shift) { rt_node_inner_4 *node; + rt_node_ptr nodep; - node = (rt_node_inner_4 *) rt_alloc_node(tree, RT_NODE_KIND_4, - shift, 0, true); + /* create the new root */ + nodep = rt_alloc_node(tree, RT_NODE_KIND_4, shift, 0, true); + node = (rt_node_inner_4 *) node_ptr_get_local(tree, nodep); node->base.n.count = 1; node->base.chunks[0] = 0; - node->children[0] = tree->root; + node->children[0] = tree->ctl->root; - tree->root->chunk = 0; - tree->root = (rt_node *) node; + /* Update the root */ + root->chunk = 0; + tree->ctl->root = nodep; + root = (rt_node *) node; shift += RT_NODE_SPAN; } - tree->max_val = 
shift_get_max_val(target_shift); + tree->ctl->max_val = shift_get_max_val(target_shift); +} + +/* Replace the child pointer stored for the chunk of 'key' in inner node 'node' with 'newchildp'; does nothing if the chunk is not present. XXX: can be merged to rt_node_search_inner with RT_ACTION_UPDATE? */ +static inline void +rt_node_update_inner(rt_node *node, uint64 key, rt_node_ptr newchildp) +{ + uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); + + switch (node->kind) + { + case RT_NODE_KIND_4: + { + rt_node_inner_4 *n4 = (rt_node_inner_4 *) node; + int idx = node_4_search_eq((rt_node_base_4 *) n4, chunk); + + if (idx < 0) + break; + + n4->children[idx] = newchildp; + break; + } + case RT_NODE_KIND_32: + { + rt_node_inner_32 *n32 = (rt_node_inner_32 *) node; + int idx = node_32_search_eq((rt_node_base_32 *) n32, chunk); + + if (idx < 0) + break; + + n32->children[idx] = newchildp; + break; + } + case RT_NODE_KIND_128: + { + rt_node_inner_128 *n128 = (rt_node_inner_128 *) node; + + if (!node_128_is_chunk_used((rt_node_base_128 *) n128, chunk)) + break; + + node_inner_128_update(n128, chunk, newchildp); + break; + } + case RT_NODE_KIND_256: + { + rt_node_inner_256 *n256 = (rt_node_inner_256 *) node; + + if (!node_inner_256_is_chunk_used(n256, chunk)) + break; + + node_inner_256_set(n256, chunk, newchildp); + break; + } + } } /* @@ -975,27 +1116,31 @@ rt_extend(radix_tree *tree, uint64 key) */ static inline void rt_set_extend(radix_tree *tree, uint64 key, uint64 value, rt_node *parent, - rt_node *node) + rt_node_ptr nodep, rt_node *node) { int shift = node->shift; + Assert(node_ptr_get_local(tree, nodep) == node); + while (shift >= RT_NODE_SPAN) { - rt_node *newchild; + rt_node_ptr newchildp; int newshift = shift - RT_NODE_SPAN; - newchild = rt_alloc_node(tree, RT_NODE_KIND_4, newshift, - RT_GET_KEY_CHUNK(key, node->shift), - newshift > 0); - rt_node_insert_inner(tree, parent, node, key, newchild); + newchildp = rt_alloc_node(tree, RT_NODE_KIND_4, newshift, + RT_GET_KEY_CHUNK(key, node->shift), + newshift > 0); + + rt_node_insert_inner(tree, parent, nodep, node, key, newchildp); parent = node; - 
node = newchild; + node = node_ptr_get_local(tree, newchildp); + nodep = newchildp; shift -= RT_NODE_SPAN; } - rt_node_insert_leaf(tree, parent, node, key, value); - tree->num_keys++; + rt_node_insert_leaf(tree, parent, nodep, node, key, value); + tree->ctl->num_keys++; } /* @@ -1006,11 +1151,11 @@ rt_set_extend(radix_tree *tree, uint64 key, uint64 value, rt_node *parent, * pointer is set to child_p. */ static inline bool -rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **child_p) +rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node_ptr *childp_p) { uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); bool found = false; - rt_node *child = NULL; + rt_node_ptr childp = InvalidRTNodePointer; switch (node->kind) { @@ -1025,7 +1170,7 @@ rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **chil found = true; if (action == RT_ACTION_FIND) - child = n4->children[idx]; + childp = n4->children[idx]; else /* RT_ACTION_DELETE */ chunk_children_array_delete(n4->base.chunks, n4->children, n4->base.n.count, idx); @@ -1041,8 +1186,9 @@ rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **chil break; found = true; + if (action == RT_ACTION_FIND) - child = n32->children[idx]; + childp = n32->children[idx]; else /* RT_ACTION_DELETE */ chunk_children_array_delete(n32->base.chunks, n32->children, n32->base.n.count, idx); @@ -1058,7 +1204,7 @@ rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **chil found = true; if (action == RT_ACTION_FIND) - child = node_inner_128_get_child(n128, chunk); + childp = node_inner_128_get_child(n128, chunk); else /* RT_ACTION_DELETE */ node_inner_128_delete(n128, chunk); @@ -1073,7 +1219,7 @@ rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **chil found = true; if (action == RT_ACTION_FIND) - child = node_inner_256_get_child(n256, chunk); + childp = node_inner_256_get_child(n256, chunk); else /* RT_ACTION_DELETE */ 
node_inner_256_delete(n256, chunk); @@ -1085,8 +1231,8 @@ rt_node_search_inner(rt_node *node, uint64 key, rt_action action, rt_node **chil if (action == RT_ACTION_DELETE && found) node->count--; - if (found && child_p) - *child_p = child; + if (found && childp_p) + *childp_p = childp; return found; } @@ -1186,8 +1332,8 @@ rt_node_search_leaf(rt_node *node, uint64 key, rt_action action, uint64 *value_p /* Insert the child to the inner node */ static bool -rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 key, - rt_node *child) +rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node_ptr nodep, rt_node *node, + uint64 key, rt_node_ptr childp) { uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); bool chunk_exists = false; @@ -1206,23 +1352,24 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke { /* found the existing chunk */ chunk_exists = true; - n4->children[idx] = child; + n4->children[idx] = childp; break; } if (unlikely(!NODE_HAS_FREE_SLOT(n4))) { rt_node_inner_32 *new32; + rt_node_ptr new32p; /* grow node from 4 to 32 */ - new32 = (rt_node_inner_32 *) rt_copy_node(tree, (rt_node *) n4, - RT_NODE_KIND_32); + new32p = rt_copy_node(tree, (rt_node *) n4, RT_NODE_KIND_32); + new32 = (rt_node_inner_32 *) node_ptr_get_local(tree, new32p); + chunk_children_array_copy(n4->base.chunks, n4->children, new32->base.chunks, new32->children, n4->base.n.count); - rt_replace_node(tree, parent, (rt_node *) n4, (rt_node *) new32, - key); + rt_replace_node(tree, parent, nodep, new32p, key); node = (rt_node *) new32; } else @@ -1236,7 +1383,7 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke count, insertpos); n4->base.chunks[insertpos] = chunk; - n4->children[insertpos] = child; + n4->children[insertpos] = childp; break; } } @@ -1251,22 +1398,23 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke { /* found the existing chunk */ chunk_exists = true; - 
n32->children[idx] = child; + n32->children[idx] = childp; break; } if (unlikely(!NODE_HAS_FREE_SLOT(n32))) { rt_node_inner_128 *new128; + rt_node_ptr new128p; /* grow node from 32 to 128 */ - new128 = (rt_node_inner_128 *) rt_copy_node(tree, (rt_node *) n32, - RT_NODE_KIND_128); + new128p = rt_copy_node(tree, (rt_node *) n32, RT_NODE_KIND_128); + new128 = (rt_node_inner_128 *) node_ptr_get_local(tree, new128p); + for (int i = 0; i < n32->base.n.count; i++) node_inner_128_insert(new128, n32->base.chunks[i], n32->children[i]); - rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, - key); + rt_replace_node(tree, parent, nodep, new128p, key); node = (rt_node *) new128; } else @@ -1279,7 +1427,7 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke count, insertpos); n32->base.chunks[insertpos] = chunk; - n32->children[insertpos] = child; + n32->children[insertpos] = childp; break; } } @@ -1293,17 +1441,19 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke { /* found the existing chunk */ chunk_exists = true; - node_inner_128_update(n128, chunk, child); + node_inner_128_update(n128, chunk, childp); break; } if (unlikely(!NODE_HAS_FREE_SLOT(n128))) { rt_node_inner_256 *new256; + rt_node_ptr new256p; /* grow node from 128 to 256 */ - new256 = (rt_node_inner_256 *) rt_copy_node(tree, (rt_node *) n128, - RT_NODE_KIND_256); + new256p = rt_copy_node(tree, (rt_node *) n128, RT_NODE_KIND_256); + new256 = (rt_node_inner_256 *) node_ptr_get_local(tree, new256p); + for (int i = 0; i < RT_NODE_MAX_SLOTS && cnt < n128->base.n.count; i++) { if (!node_128_is_chunk_used((rt_node_base_128 *) n128, i)) @@ -1313,13 +1463,12 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke cnt++; } - rt_replace_node(tree, parent, (rt_node *) n128, (rt_node *) new256, - key); + rt_replace_node(tree, parent, nodep, new256p, key); node = (rt_node *) new256; } else { - node_inner_128_insert(n128, 
chunk, child); + node_inner_128_insert(n128, chunk, childp); break; } } @@ -1331,7 +1480,7 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke chunk_exists = node_inner_256_is_chunk_used(n256, chunk); Assert(chunk_exists || NODE_HAS_FREE_SLOT(n256)); - node_inner_256_set(n256, chunk, child); + node_inner_256_set(n256, chunk, childp); break; } } @@ -1351,7 +1500,7 @@ rt_node_insert_inner(radix_tree *tree, rt_node *parent, rt_node *node, uint64 ke /* Insert the value to the leaf node */ static bool -rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, +rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node_ptr nodep, rt_node *node, uint64 key, uint64 value) { uint8 chunk = RT_GET_KEY_CHUNK(key, node->shift); @@ -1378,16 +1527,16 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, if (unlikely(!NODE_HAS_FREE_SLOT(n4))) { rt_node_leaf_32 *new32; + rt_node_ptr new32p; /* grow node from 4 to 32 */ - new32 = (rt_node_leaf_32 *) rt_copy_node(tree, (rt_node *) n4, - RT_NODE_KIND_32); + new32p = rt_copy_node(tree, (rt_node *) n4, RT_NODE_KIND_32); + new32 = (rt_node_leaf_32 *) node_ptr_get_local(tree, new32p); chunk_values_array_copy(n4->base.chunks, n4->values, new32->base.chunks, new32->values, n4->base.n.count); - rt_replace_node(tree, parent, (rt_node *) n4, (rt_node *) new32, - key); + rt_replace_node(tree, parent, nodep, new32p, key); node = (rt_node *) new32; } else @@ -1423,15 +1572,16 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, if (unlikely(!NODE_HAS_FREE_SLOT(n32))) { rt_node_leaf_128 *new128; + rt_node_ptr new128p; /* grow node from 32 to 128 */ - new128 = (rt_node_leaf_128 *) rt_copy_node(tree, (rt_node *) n32, - RT_NODE_KIND_128); + new128p = rt_copy_node(tree, (rt_node *) n32, RT_NODE_KIND_128); + new128 = (rt_node_leaf_128 *) node_ptr_get_local(tree, new128p); + for (int i = 0; i < n32->base.n.count; i++) node_leaf_128_insert(new128, n32->base.chunks[i], 
n32->values[i]); - rt_replace_node(tree, parent, (rt_node *) n32, (rt_node *) new128, - key); + rt_replace_node(tree, parent, nodep, new128p, key); node = (rt_node *) new128; } else @@ -1465,10 +1615,12 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, if (unlikely(!NODE_HAS_FREE_SLOT(n128))) { rt_node_leaf_256 *new256; + rt_node_ptr new256p; /* grow node from 128 to 256 */ - new256 = (rt_node_leaf_256 *) rt_copy_node(tree, (rt_node *) n128, - RT_NODE_KIND_256); + new256p = rt_copy_node(tree, (rt_node *) n128, RT_NODE_KIND_256); + new256 = (rt_node_leaf_256 *) node_ptr_get_local(tree, new256p); + for (int i = 0; i < RT_NODE_MAX_SLOTS && cnt < n128->base.n.count; i++) { if (!node_128_is_chunk_used((rt_node_base_128 *) n128, i)) @@ -1478,8 +1630,7 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, cnt++; } - rt_replace_node(tree, parent, (rt_node *) n128, (rt_node *) new256, - key); + rt_replace_node(tree, parent, nodep, new256p, key); node = (rt_node *) new256; } else @@ -1518,33 +1669,46 @@ rt_node_insert_leaf(radix_tree *tree, rt_node *parent, rt_node *node, * Create the radix tree in the given memory context and return it. 
*/ radix_tree * -rt_create(MemoryContext ctx) +rt_create(MemoryContext ctx, dsa_area *dsa) { radix_tree *tree; MemoryContext old_ctx; old_ctx = MemoryContextSwitchTo(ctx); - tree = palloc(sizeof(radix_tree)); + tree = (radix_tree *) palloc0(sizeof(radix_tree)); tree->context = ctx; - tree->root = NULL; - tree->max_val = 0; - tree->num_keys = 0; + + if (dsa != NULL) + { + tree->dsa = dsa; + tree->ctl_dp = dsa_allocate0(dsa, sizeof(radix_tree_control)); + tree->ctl = (radix_tree_control *) dsa_get_address(dsa, tree->ctl_dp); + } + else + { + tree->ctl_dp = InvalidDsaPointer; + tree->ctl = (radix_tree_control *) palloc0(sizeof(radix_tree_control)); + } + + tree->ctl->root = InvalidRTNodePointer; + tree->ctl->max_val = 0; + tree->ctl->num_keys = 0; /* Create the slab allocator for each size class */ - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + if (dsa == NULL) { - tree->inner_slabs[i] = SlabContextCreate(ctx, - rt_node_kind_info[i].name, - rt_node_kind_info[i].inner_blocksize, - rt_node_kind_info[i].inner_size); - tree->leaf_slabs[i] = SlabContextCreate(ctx, - rt_node_kind_info[i].name, - rt_node_kind_info[i].leaf_blocksize, - rt_node_kind_info[i].leaf_size); -#ifdef RT_DEBUG - tree->cnt[i] = 0; -#endif + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + tree->inner_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].inner_size); + tree->leaf_slabs[i] = SlabContextCreate(ctx, + rt_node_kind_info[i].name, + rt_node_kind_info[i].leaf_blocksize, + rt_node_kind_info[i].leaf_size); + } } MemoryContextSwitchTo(old_ctx); @@ -1552,16 +1716,48 @@ rt_create(MemoryContext ctx) return tree; } +dsa_pointer +rt_get_dsa_pointer(radix_tree *tree) +{ + return tree->ctl_dp; +} + +radix_tree * +rt_attach(dsa_area *dsa, dsa_pointer dp) +{ + radix_tree *tree; + + /* XXX: memory context support */ + tree = (radix_tree *) palloc0(sizeof(radix_tree)); + tree->dsa = dsa; /* marks the tree as shared, see RadixTreeIsShared(); node pointers are then resolved through this DSA area */ + tree->ctl_dp = dp; + tree->ctl = (radix_tree_control *) 
dsa_get_address(dsa, dp); + + /* XXX: do we need to set a callback on exit to detach dsa? */ + + return tree; +} + /* * Free the given radix tree. */ void rt_free(radix_tree *tree) { - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + if (RadixTreeIsShared(tree)) + { + dsa_free(tree->dsa, tree->ctl_dp); + dsa_detach(tree->dsa); + } + else { - MemoryContextDelete(tree->inner_slabs[i]); - MemoryContextDelete(tree->leaf_slabs[i]); + pfree(tree->ctl); + + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + MemoryContextDelete(tree->inner_slabs[i]); + MemoryContextDelete(tree->leaf_slabs[i]); + } } pfree(tree); @@ -1576,48 +1772,48 @@ rt_set(radix_tree *tree, uint64 key, uint64 value) { int shift; bool updated; + rt_node *parent; rt_node *node; - rt_node *parent = tree->root; + rt_node_ptr nodep; /* Empty tree, create the root */ - if (!tree->root) + if (!RTNodePtrIsValid(tree->ctl->root)) rt_new_root(tree, key); /* Extend the tree if necessary */ - if (key > tree->max_val) + if (key > tree->ctl->max_val) rt_extend(tree, key); - Assert(tree->root); - - shift = tree->root->shift; - node = tree->root; + parent = node_ptr_get_local(tree, tree->ctl->root); + nodep = tree->ctl->root; + shift = parent->shift; /* Descend the tree until a leaf node */ while (shift >= 0) { - rt_node *child; + rt_node_ptr childp; + + node = node_ptr_get_local(tree, nodep); if (NODE_IS_LEAF(node)) break; - if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &childp)) { - rt_set_extend(tree, key, value, parent, node); + rt_set_extend(tree, key, value, parent, nodep, node); return false; } - Assert(child); - parent = node; - node = child; + nodep = childp; shift -= RT_NODE_SPAN; } - updated = rt_node_insert_leaf(tree, parent, nodep, node, key, value); /* Update the statistics */ if (!updated) - tree->num_keys++; + tree->ctl->num_keys++; return updated; } @@ -1635,24 +1831,24 
@@ rt_search(radix_tree *tree, uint64 key, uint64 *value_p) Assert(value_p != NULL); - if (!tree->root || key > tree->max_val) + if (!RTNodePtrIsValid(tree->ctl->root) || key > tree->ctl->max_val) return false; - node = tree->root; - shift = tree->root->shift; + node = node_ptr_get_local(tree, tree->ctl->root); + shift = node->shift; /* Descend the tree until a leaf node */ while (shift >= 0) { - rt_node *child; + rt_node_ptr childp; if (NODE_IS_LEAF(node)) break; - if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &childp)) return false; - node = child; + node = node_ptr_get_local(tree, childp); shift -= RT_NODE_SPAN; } @@ -1667,37 +1863,40 @@ bool rt_delete(radix_tree *tree, uint64 key) { rt_node *node; - rt_node *stack[RT_MAX_LEVEL] = {0}; + rt_node_ptr nodep; + rt_node_ptr stack[RT_MAX_LEVEL] = {0}; int shift; int level; bool deleted; - if (!tree->root || key > tree->max_val) + if (!RTNodePtrIsValid(tree->ctl->root) || key > tree->ctl->max_val) return false; /* * Descend the tree to search the key while building a stack of nodes we * visited. */ - node = tree->root; - shift = tree->root->shift; + nodep = tree->ctl->root; + node = node_ptr_get_local(tree, nodep); + shift = node->shift; level = -1; while (shift > 0) { - rt_node *child; + rt_node_ptr childp; /* Push the current node to the stack */ - stack[++level] = node; + stack[++level] = nodep; + node = node_ptr_get_local(tree, nodep); - if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &child)) + if (!rt_node_search_inner(node, key, RT_ACTION_FIND, &childp)) return false; - node = child; + nodep = childp; shift -= RT_NODE_SPAN; } /* Delete the key from the leaf node if exists */ - Assert(NODE_IS_LEAF(node)); + node = node_ptr_get_local(tree, nodep); deleted = rt_node_search_leaf(node, key, RT_ACTION_DELETE, NULL); if (!deleted) @@ -1707,7 +1906,7 @@ rt_delete(radix_tree *tree, uint64 key) } /* Found the key to delete. 
Update the statistics */ - tree->num_keys--; + tree->ctl->num_keys--; /* * Return if the leaf node still has keys and we don't need to delete the @@ -1717,12 +1916,13 @@ rt_delete(radix_tree *tree, uint64 key) return true; /* Free the empty leaf node */ - rt_free_node(tree, node); + rt_free_node(tree, nodep); /* Delete the key in inner nodes recursively */ while (level >= 0) { - node = stack[level--]; + nodep = stack[level--]; + node = node_ptr_get_local(tree, nodep); deleted = rt_node_search_inner(node, key, RT_ACTION_DELETE, NULL); Assert(deleted); @@ -1732,7 +1932,7 @@ rt_delete(radix_tree *tree, uint64 key) break; /* The node became empty */ - rt_free_node(tree, node); + rt_free_node(tree, nodep); } /* @@ -1741,8 +1941,8 @@ rt_delete(radix_tree *tree, uint64 key) */ if (level == 0) { - tree->root = NULL; - tree->max_val = 0; + tree->ctl->root = InvalidRTNodePointer; + tree->ctl->max_val = 0; } return true; @@ -1753,6 +1953,7 @@ rt_iter * rt_begin_iterate(radix_tree *tree) { MemoryContext old_ctx; + rt_node *root; rt_iter *iter; int top_level; @@ -1765,14 +1966,15 @@ rt_begin_iterate(radix_tree *tree) if (!iter->tree) return iter; - top_level = iter->tree->root->shift / RT_NODE_SPAN; + root = node_ptr_get_local(tree, tree->ctl->root); + top_level = root->shift / RT_NODE_SPAN; iter->stack_len = top_level; /* * Descend to the left most leaf node from the root. The key is being * constructed while descending to the leaf. 
*/ - rt_update_iter_stack(iter, iter->tree->root, top_level); + rt_update_iter_stack(iter, root, top_level); MemoryContextSwitchTo(old_ctx); @@ -1792,7 +1994,6 @@ rt_update_iter_stack(rt_iter *iter, rt_node *from_node, int from) { rt_node_iter *node_iter = &(iter->stack[level--]); - /* Set the node to this level */ node_iter->node = node; node_iter->current_idx = -1; @@ -1828,7 +2029,6 @@ rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p) /* Advance the leaf node iterator to get next key-value pair */ found = rt_node_leaf_iterate_next(iter, &(iter->stack[0]), &value); - if (found) { *key_p = iter->key; @@ -1898,7 +2098,7 @@ rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter) if (node_iter->current_idx >= n4->base.n.count) break; - child = n4->children[node_iter->current_idx]; + child = node_ptr_get_local(iter->tree, n4->children[node_iter->current_idx]); key_chunk = n4->base.chunks[node_iter->current_idx]; found = true; break; @@ -1911,7 +2111,7 @@ rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter) if (node_iter->current_idx >= n32->base.n.count) break; - child = n32->children[node_iter->current_idx]; + child = node_ptr_get_local(iter->tree, n32->children[node_iter->current_idx]); key_chunk = n32->base.chunks[node_iter->current_idx]; found = true; break; @@ -1931,7 +2131,7 @@ rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter) break; node_iter->current_idx = i; - child = node_inner_128_get_child(n128, i); + child = node_ptr_get_local(iter->tree, node_inner_128_get_child(n128, i)); key_chunk = i; found = true; break; @@ -1951,7 +2151,7 @@ rt_node_inner_iterate_next(rt_iter *iter, rt_node_iter *node_iter) break; node_iter->current_idx = i; - child = node_inner_256_get_child(n256, i); + child = node_ptr_get_local(iter->tree, node_inner_256_get_child(n256, i)); key_chunk = i; found = true; break; @@ -2062,7 +2262,7 @@ rt_node_leaf_iterate_next(rt_iter *iter, rt_node_iter *node_iter, uint64 
rt_num_entries(radix_tree *tree) { - return tree->num_keys; + return tree->ctl->num_keys; } /* @@ -2071,12 +2271,17 @@ rt_num_entries(radix_tree *tree) uint64 rt_memory_usage(radix_tree *tree) { - Size total = sizeof(radix_tree); + Size total = sizeof(radix_tree) + sizeof(radix_tree_control); - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + if (RadixTreeIsShared(tree)) + total = dsa_get_total_size(tree->dsa); + else { - total += MemoryContextMemAllocated(tree->inner_slabs[i], true); - total += MemoryContextMemAllocated(tree->leaf_slabs[i], true); + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + { + total += MemoryContextMemAllocated(tree->inner_slabs[i], true); + total += MemoryContextMemAllocated(tree->leaf_slabs[i], true); + } } return total; @@ -2161,17 +2366,18 @@ void rt_stats(radix_tree *tree) { ereport(LOG, (errmsg("num_keys = %lu, height = %u, n4 = %u, n32 = %u, n128 = %u, n256 = %u", - tree->num_keys, - tree->root->shift / RT_NODE_SPAN, - tree->cnt[0], - tree->cnt[1], - tree->cnt[2], - tree->cnt[3]))); + tree->ctl->num_keys, + node_ptr_get_local(tree, tree->ctl->root)->shift / RT_NODE_SPAN, + tree->ctl->cnt[0], + tree->ctl->cnt[1], + tree->ctl->cnt[2], + tree->ctl->cnt[3]))); } static void -rt_dump_node(rt_node *node, int level, bool recurse) +rt_dump_node(radix_tree *tree, rt_node_ptr nodep, int level, bool recurse) { + rt_node *node = node_ptr_get_local(tree, nodep); char space[128] = {0}; fprintf(stderr, "[%s] kind %d, count %u, shift %u, chunk 0x%X:\n", @@ -2205,7 +2411,7 @@ rt_dump_node(rt_node *node, int level, bool recurse) space, n4->base.chunks[i]); if (recurse) - rt_dump_node(n4->children[i], level + 1, recurse); + rt_dump_node(tree, n4->children[i], level + 1, recurse); else fprintf(stderr, "\n"); } @@ -2232,7 +2438,7 @@ rt_dump_node(rt_node *node, int level, bool recurse) if (recurse) { - rt_dump_node(n32->children[i], level + 1, recurse); + rt_dump_node(tree, n32->children[i], level + 1, recurse); } else fprintf(stderr, "\n"); @@ -2284,7 
+2490,7 @@ rt_dump_node(rt_node *node, int level, bool recurse) space, i); if (recurse) - rt_dump_node(node_inner_128_get_child(n128, i), + rt_dump_node(tree, node_inner_128_get_child(n128, i), level + 1, recurse); else fprintf(stderr, "\n"); @@ -2317,8 +2523,8 @@ rt_dump_node(rt_node *node, int level, bool recurse) space, i); if (recurse) - rt_dump_node(node_inner_256_get_child(n256, i), level + 1, - recurse); + rt_dump_node(tree, node_inner_256_get_child(n256, i), + level + 1, recurse); else fprintf(stderr, "\n"); } @@ -2328,6 +2534,28 @@ rt_dump_node(rt_node *node, int level, bool recurse) } } +void +rt_dump(radix_tree *tree) +{ + for (int i = 0; i < RT_NODE_KIND_COUNT; i++) + fprintf(stderr, "%s\tinner_size%lu\tinner_blocksize %lu\tleaf_size %lu\tleaf_blocksize %lu\n", + rt_node_kind_info[i].name, + rt_node_kind_info[i].inner_size, + rt_node_kind_info[i].inner_blocksize, + rt_node_kind_info[i].leaf_size, + rt_node_kind_info[i].leaf_blocksize); + fprintf(stderr, "max_val = %lu\n", tree->ctl->max_val); + + if (!tree->ctl->root) + { + fprintf(stderr, "empty tree\n"); + return; + } + + rt_dump_node(tree, tree->ctl->root, 0, true); +} + +#ifdef unused void rt_dump_search(radix_tree *tree, uint64 key) { @@ -2336,23 +2564,23 @@ rt_dump_search(radix_tree *tree, uint64 key) int level = 0; elog(NOTICE, "-----------------------------------------------------------"); - elog(NOTICE, "max_val = %lu (0x%lX)", tree->max_val, tree->max_val); + elog(NOTICE, "max_val = %lu (0x%lX)", tree->ctl->max_val, tree->ctl->max_val); - if (!tree->root) + if (!tree->ctl->root) { elog(NOTICE, "tree is empty"); return; } - if (key > tree->max_val) + if (key > tree->ctl->max_val) { elog(NOTICE, "key %lu (0x%lX) is larger than max val", key, key); return; } - node = tree->root; - shift = tree->root->shift; + node = tree->ctl->root; + shift = tree->ctl->root->shift; while (shift >= 0) { rt_node *child; @@ -2377,25 +2605,6 @@ rt_dump_search(radix_tree *tree, uint64 key) level++; } } +#endif -void 
-rt_dump(radix_tree *tree) -{ - for (int i = 0; i < RT_NODE_KIND_COUNT; i++) - fprintf(stderr, "%s\tinner_size%lu\tinner_blocksize %lu\tleaf_size %lu\tleaf_blocksize %lu\n", - rt_node_kind_info[i].name, - rt_node_kind_info[i].inner_size, - rt_node_kind_info[i].inner_blocksize, - rt_node_kind_info[i].leaf_size, - rt_node_kind_info[i].leaf_blocksize); - fprintf(stderr, "max_val = %lu\n", tree->max_val); - - if (!tree->root) - { - fprintf(stderr, "empty tree\n"); - return; - } - - rt_dump_node(tree->root, 0, true); -} #endif diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 82376fde2d..ad169882af 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -1024,6 +1024,18 @@ dsa_set_size_limit(dsa_area *area, size_t limit) LWLockRelease(DSA_AREA_LOCK(area)); } +size_t +dsa_get_total_size(dsa_area *area) +{ + size_t size; + + LWLockAcquire(DSA_AREA_LOCK(area), LW_SHARED); + size = area->control->total_segment_size; + LWLockRelease(DSA_AREA_LOCK(area)); + + return size; +} + /* * Aggressively free all spare memory in the hope of returning DSM segments to * the operating system. 
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index d5d7668617..d9d8355c21 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -14,18 +14,22 @@ #define RADIXTREE_H #include "postgres.h" +#include "utils/dsa.h" #define RT_DEBUG 1 typedef struct radix_tree radix_tree; typedef struct rt_iter rt_iter; -extern radix_tree *rt_create(MemoryContext ctx); +extern radix_tree *rt_create(MemoryContext ctx, dsa_area *dsa); extern void rt_free(radix_tree *tree); extern bool rt_search(radix_tree *tree, uint64 key, uint64 *val_p); extern bool rt_set(radix_tree *tree, uint64 key, uint64 val); extern rt_iter *rt_begin_iterate(radix_tree *tree); +extern dsa_pointer rt_get_dsa_pointer(radix_tree *tree); +extern radix_tree *rt_attach(dsa_area *dsa, dsa_pointer dp); + extern bool rt_iterate_next(rt_iter *iter, uint64 *key_p, uint64 *value_p); extern void rt_end_iterate(rt_iter *iter); extern bool rt_delete(radix_tree *tree, uint64 key); diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h index 405606fe2f..dad06adecc 100644 --- a/src/include/utils/dsa.h +++ b/src/include/utils/dsa.h @@ -117,6 +117,7 @@ extern dsa_handle dsa_get_handle(dsa_area *area); extern dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags); extern void dsa_free(dsa_area *area, dsa_pointer dp); extern void *dsa_get_address(dsa_area *area, dsa_pointer dp); +extern size_t dsa_get_total_size(dsa_area *area); extern void dsa_trim(dsa_area *area); extern void dsa_dump(dsa_area *area); diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out index cc6970c87c..a0ff1e1c77 100644 --- a/src/test/modules/test_radixtree/expected/test_radixtree.out +++ b/src/test/modules/test_radixtree/expected/test_radixtree.out @@ -5,21 +5,38 @@ CREATE EXTENSION test_radixtree; -- SELECT test_radixtree(); NOTICE: testing radix tree node types with shift "0" +NOTICE: testing radix tree 
node types with shift "0" +NOTICE: testing radix tree node types with shift "8" NOTICE: testing radix tree node types with shift "8" NOTICE: testing radix tree node types with shift "16" +NOTICE: testing radix tree node types with shift "16" NOTICE: testing radix tree node types with shift "24" +NOTICE: testing radix tree node types with shift "24" +NOTICE: testing radix tree node types with shift "32" NOTICE: testing radix tree node types with shift "32" NOTICE: testing radix tree node types with shift "40" +NOTICE: testing radix tree node types with shift "40" +NOTICE: testing radix tree node types with shift "48" NOTICE: testing radix tree node types with shift "48" NOTICE: testing radix tree node types with shift "56" +NOTICE: testing radix tree node types with shift "56" +NOTICE: testing radix tree with pattern "all ones" NOTICE: testing radix tree with pattern "all ones" NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "clusters of ten" NOTICE: testing radix tree with pattern "clusters of ten" NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "one-every-64k" NOTICE: testing radix tree with pattern "one-every-64k" NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "single values, distance > 2^32" NOTICE: testing radix tree with pattern "single values, distance > 2^32" NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^60" NOTICE: testing radix tree with pattern "clusters, distance > 2^60" test_radixtree ---------------- diff --git a/src/test/modules/test_radixtree/test_radixtree.c 
b/src/test/modules/test_radixtree/test_radixtree.c index cb3596755d..a08495834e 100644 --- a/src/test/modules/test_radixtree/test_radixtree.c +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -19,6 +19,7 @@ #include "nodes/bitmapset.h" #include "storage/block.h" #include "storage/itemptr.h" +#include "storage/lwlock.h" #include "utils/memutils.h" #include "utils/timestamp.h" @@ -111,7 +112,7 @@ test_empty(void) radix_tree *radixtree; uint64 dummy; - radixtree = rt_create(CurrentMemoryContext); + radixtree = rt_create(CurrentMemoryContext, NULL); if (rt_search(radixtree, 0, &dummy)) elog(ERROR, "rt_search on empty tree returned true"); @@ -217,14 +218,10 @@ test_node_types_delete(radix_tree *radixtree, uint8 shift) * level. */ static void -test_node_types(uint8 shift) +do_test_node_types(radix_tree *radixtree, uint8 shift) { - radix_tree *radixtree; - elog(NOTICE, "testing radix tree node types with shift \"%d\"", shift); - radixtree = rt_create(CurrentMemoryContext); - /* * Insert and search entries for every node type at the 'shift' level, * then delete all entries to make it empty, and insert and search entries @@ -233,19 +230,38 @@ test_node_types(uint8 shift) test_node_types_insert(radixtree, shift); test_node_types_delete(radixtree, shift); test_node_types_insert(radixtree, shift); +} - rt_free(radixtree); +static void +test_node_types(void) +{ + int tranche_id = LWLockNewTrancheId(); + + for (int shift = 0; shift <= (64 - 8); shift += 8) + { + radix_tree *tree; + dsa_area *dsa; + + /* Test the local radix tree */ + tree = rt_create(CurrentMemoryContext, NULL); + do_test_node_types(tree, shift); + rt_free(tree); + + /* Test the shared radix tree */ + dsa = dsa_create(tranche_id); + tree = rt_create(CurrentMemoryContext, dsa); + do_test_node_types(tree, shift); + rt_free(tree); + } } /* * Test with a repeating pattern, defined by the 'spec'. 
*/ static void -test_pattern(const test_spec * spec) +do_test_pattern(radix_tree *radixtree, const test_spec * spec) { - radix_tree *radixtree; rt_iter *iter; - MemoryContext radixtree_ctx; TimestampTz starttime; TimestampTz endtime; uint64 n; @@ -271,18 +287,6 @@ test_pattern(const test_spec * spec) pattern_values[pattern_num_values++] = i; } - /* - * Allocate the radix tree. - * - * Allocate it in a separate memory context, so that we can print its - * memory usage easily. - */ - radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, - "radixtree test", - ALLOCSET_SMALL_SIZES); - MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); - radixtree = rt_create(radixtree_ctx); - /* * Add values to the set. */ @@ -336,8 +340,6 @@ test_pattern(const test_spec * spec) mem_usage = rt_memory_usage(radixtree); fprintf(stderr, "rt_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n", mem_usage, (double) mem_usage / spec->num_values); - - MemoryContextStats(radixtree_ctx); } /* Check that rt_num_entries works */ @@ -484,21 +486,53 @@ test_pattern(const test_spec * spec) if ((nbefore - ndeleted) != nafter) elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT "after " UINT64_FORMAT " deletion", nafter, (nbefore - ndeleted), ndeleted); +} + +static void +test_patterns(void) +{ + int tranche_id = LWLockNewTrancheId(); + + /* Test different test patterns, with lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + { + radix_tree *tree; + MemoryContext radixtree_ctx; + dsa_area *dsa; + const test_spec *spec = &test_specs[i]; - MemoryContextDelete(radixtree_ctx); + /* + * Allocate the radix tree. + * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. 
+ */ + radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, + "radixtree test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); + + /* Test the local radix tree */ + tree = rt_create(radixtree_ctx, NULL); + do_test_pattern(tree, spec); + rt_free(tree); + MemoryContextReset(radixtree_ctx); + + /* Test the shared radix tree */ + dsa = dsa_create(tranche_id); + tree = rt_create(radixtree_ctx, dsa); + do_test_pattern(tree, spec); + rt_free(tree); + MemoryContextDelete(radixtree_ctx); + } } Datum test_radixtree(PG_FUNCTION_ARGS) { test_empty(); - - for (int shift = 0; shift <= (64 - 8); shift += 8) - test_node_types(shift); - - /* Test different test patterns, with lots of entries */ - for (int i = 0; i < lengthof(test_specs); i++) - test_pattern(&test_specs[i]); + test_node_types(); + test_patterns(); PG_RETURN_VOID(); } -- 2.31.1