From 01ac02bfe6ff33f779ae0da2d0794cd1a3a2f1c3 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada
Date: Wed, 14 Sep 2022 12:38:51 +0000
Subject: [PATCH v35 3/7] Add radixtree template

WIP: commit message based on template comments
---
 src/backend/utils/mmgr/dsa.c                  |   12 +
 src/include/lib/radixtree.h                   | 3101 +++++++++++++++++
 src/include/utils/dsa.h                       |    1 +
 src/test/modules/Makefile                     |    1 +
 src/test/modules/meson.build                  |    1 +
 src/test/modules/test_radixtree/.gitignore    |    4 +
 src/test/modules/test_radixtree/Makefile      |   23 +
 src/test/modules/test_radixtree/README        |    7 +
 .../expected/test_radixtree.out               |   48 +
 src/test/modules/test_radixtree/meson.build   |   35 +
 .../test_radixtree/sql/test_radixtree.sql     |    7 +
 .../test_radixtree/test_radixtree--1.0.sql    |    8 +
 .../modules/test_radixtree/test_radixtree.c   |  776 +++++
 .../test_radixtree/test_radixtree.control     |    4 +
 src/tools/pginclude/cpluspluscheck            |    6 +
 src/tools/pginclude/headerscheck              |    6 +
 16 files changed, 4040 insertions(+)
 create mode 100644 src/include/lib/radixtree.h
 create mode 100644 src/test/modules/test_radixtree/.gitignore
 create mode 100644 src/test/modules/test_radixtree/Makefile
 create mode 100644 src/test/modules/test_radixtree/README
 create mode 100644 src/test/modules/test_radixtree/expected/test_radixtree.out
 create mode 100644 src/test/modules/test_radixtree/meson.build
 create mode 100644 src/test/modules/test_radixtree/sql/test_radixtree.sql
 create mode 100644 src/test/modules/test_radixtree/test_radixtree--1.0.sql
 create mode 100644 src/test/modules/test_radixtree/test_radixtree.c
 create mode 100644 src/test/modules/test_radixtree/test_radixtree.control

diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c
index 7a3781466e..0fa155c525 100644
--- a/src/backend/utils/mmgr/dsa.c
+++ b/src/backend/utils/mmgr/dsa.c
@@ -1024,6 +1024,18 @@ dsa_set_size_limit(dsa_area *area, size_t limit)
 	LWLockRelease(DSA_AREA_LOCK(area));
 }
 
+size_t
+dsa_get_total_size(dsa_area *area)
+{
+	size_t		size;
+
+	LWLockAcquire(DSA_AREA_LOCK(area), LW_SHARED);
+	size = area->control->total_segment_size;
+	LWLockRelease(DSA_AREA_LOCK(area));
+
+	return size;
+}
+
 /*
  * Aggressively free all spare memory in the hope of returning DSM segments to
  * the operating system.
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
new file mode 100644
index 0000000000..4df273ddeb
--- /dev/null
+++ b/src/include/lib/radixtree.h
@@ -0,0 +1,3101 @@
+/*-------------------------------------------------------------------------
+ *
+ * radixtree.h
+ *		Template for adaptive radix tree.
+ *
+ * This module employs the idea from the paper "The Adaptive Radix Tree:
+ * ARTful Indexing for Main-Memory Databases" by Viktor Leis, Alfons Kemper,
+ * and Thomas Neumann, 2013. The radix tree uses adaptive node sizes, with a
+ * small number of node types, each with a different number of elements.
+ * Depending on the number of children, the appropriate node type is used.
+ *
+ * WIP: notes about traditional radix tree trading off span vs height...
+ *
+ * There are two kinds of nodes, inner nodes and leaves. Inner nodes
+ * map partial keys to child pointers.
+ *
+ * The ART paper mentions three ways to implement leaves:
+ *
+ * "- Single-value leaves: The values are stored using an addi-
+ *  tional leaf node type which stores one value.
+ *  - Multi-value leaves: The values are stored in one of four
+ *  different leaf node types, which mirror the structure of
+ *  inner nodes, but contain values instead of pointers.
+ *  - Combined pointer/value slots: If values fit into point-
+ *  ers, no separate node types are necessary. Instead, each
+ *  pointer storage location in an inner node can either
+ *  store a pointer or a value."
+ *
+ * We chose "multi-value leaves" to avoid the additional pointer traversal
+ * required by "single-value leaves".
+ *
+ * For simplicity, the key is assumed to be a 64-bit unsigned integer. The
+ * tree doesn't need to contain paths where the highest bytes of all keys
+ * are zero. That way, the tree's height adapts to the distribution of keys.
+ *
+ * TODO: In the future it might be worthwhile to offer configurability of
+ * the leaf implementation for different use cases. Single-value leaves
+ * would give more flexibility in key type, including variable-length keys.
+ *
+ * There are some optimizations not yet implemented, particularly path
+ * compression and lazy path expansion.
+ *
+ * To handle concurrency, we use a single reader-writer lock for the radix
+ * tree. The radix tree is exclusively locked during write operations such
+ * as RT_SET() and RT_DELETE(), and shared locked during read operations
+ * such as RT_SEARCH(). An iteration also holds the shared lock on the radix
+ * tree until it is completed.
+ *
+ * TODO: The current locking mechanism is not optimized for high concurrency
+ * with mixed read-write workloads. In the future it might be worthwhile
+ * to replace it with the Optimistic Lock Coupling or ROWEX mentioned in
+ * the paper "The ART of Practical Synchronization" by the same authors as
+ * the ART paper, 2016.
+ *
+ * WIP: the radix tree nodes don't shrink.
+ *
+ * To generate a radix tree and associated functions for a use case, several
+ * macros have to be #define'ed before this file is included. Including
+ * the file #undef's all those, so a new radix tree can be generated
+ * afterwards.
+ * The relevant parameters are:
+ * - RT_PREFIX - prefix for all symbol names generated. A prefix of 'foo'
+ *	 will result in radix tree type 'foo_radix_tree' and functions like
+ *	 'foo_create'/'foo_free' and so forth.
+ * - RT_DECLARE - if defined, function prototypes and type declarations are
+ *	 generated
+ * - RT_DEFINE - if defined, function definitions are generated
+ * - RT_SCOPE - in which scope (e.g. extern, static inline) do function
+ *	 declarations reside
+ * - RT_VALUE_TYPE - the type of the value.
+ *
+ * Optional parameters:
+ * - RT_SHMEM - if defined, the radix tree is created in the DSA area
+ *	 so that multiple processes can access it simultaneously.
+ * - RT_DEBUG - if defined, add stats tracking and debugging functions
+ *
+ * Interface
+ * ---------
+ *
+ * RT_CREATE		- Create a new, empty radix tree
+ * RT_FREE			- Free the radix tree
+ * RT_SEARCH		- Search a key-value pair
+ * RT_SET			- Set a key-value pair
+ * RT_BEGIN_ITERATE	- Begin iterating through all key-value pairs
+ * RT_ITERATE_NEXT	- Return next key-value pair, if any
+ * RT_END_ITERATE	- End iteration
+ * RT_MEMORY_USAGE	- Get the memory usage
+ *
+ * Interface for Shared Memory
+ * ---------
+ *
+ * RT_ATTACH		- Attach to the radix tree
+ * RT_DETACH		- Detach from the radix tree
+ * RT_GET_HANDLE	- Return the handle of the radix tree
+ *
+ * Optional Interface
+ * ---------
+ *
+ * RT_DELETE		- Delete a key-value pair.
Declared/define if RT_USE_DELETE is defined + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/lib/radixtree.h + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "nodes/bitmapset.h" +#include "port/pg_bitutils.h" +#include "port/simd.h" +#include "utils/dsa.h" +#include "utils/memutils.h" + +/* helpers */ +#define RT_MAKE_PREFIX(a) CppConcat(a,_) +#define RT_MAKE_NAME(name) RT_MAKE_NAME_(RT_MAKE_PREFIX(RT_PREFIX),name) +#define RT_MAKE_NAME_(a,b) CppConcat(a,b) + +/* function declarations */ +#define RT_CREATE RT_MAKE_NAME(create) +#define RT_FREE RT_MAKE_NAME(free) +#define RT_SEARCH RT_MAKE_NAME(search) +#ifdef RT_SHMEM +#define RT_ATTACH RT_MAKE_NAME(attach) +#define RT_DETACH RT_MAKE_NAME(detach) +#define RT_GET_HANDLE RT_MAKE_NAME(get_handle) +#endif +#define RT_SET RT_MAKE_NAME(set) +#define RT_BEGIN_ITERATE RT_MAKE_NAME(begin_iterate) +#define RT_ITERATE_NEXT RT_MAKE_NAME(iterate_next) +#define RT_END_ITERATE RT_MAKE_NAME(end_iterate) +#ifdef RT_USE_DELETE +#define RT_DELETE RT_MAKE_NAME(delete) +#endif +#define RT_MEMORY_USAGE RT_MAKE_NAME(memory_usage) +#define RT_DUMP RT_MAKE_NAME(dump) +#define RT_DUMP_NODE RT_MAKE_NAME(dump_node) +#define RT_DUMP_SEARCH RT_MAKE_NAME(dump_search) + +#define RT_STATS RT_MAKE_NAME(stats) + +/* internal helper functions (no externally visible prototypes) */ +#define RT_NEW_ROOT RT_MAKE_NAME(new_root) +#define RT_RECURSIVE_SET RT_MAKE_NAME(recursive_set) +#define RT_RECURSIVE_DELETE RT_MAKE_NAME(recursive_delete) +#define RT_ALLOC_NODE RT_MAKE_NAME(alloc_node) +#define RT_ALLOC_LEAF RT_MAKE_NAME(alloc_leaf) +#define RT_FREE_NODE RT_MAKE_NAME(free_node) +#define RT_FREE_LEAF RT_MAKE_NAME(free_leaf) +#define RT_FREE_RECURSE RT_MAKE_NAME(free_recurse) +#define RT_EXTEND_UP RT_MAKE_NAME(extend_up) +#define RT_EXTEND_DOWN RT_MAKE_NAME(extend_down) +#define RT_COPY_COMMON RT_MAKE_NAME(copy_common) +#define RT_PTR_SET_LOCAL RT_MAKE_NAME(ptr_set_local) +#define RT_PTR_ALLOC_IS_VALID RT_MAKE_NAME(ptr_stored_is_valid) +#define RT_NODE_3_SEARCH_EQ RT_MAKE_NAME(node_3_search_eq) +#define RT_NODE_32_SEARCH_EQ RT_MAKE_NAME(node_32_search_eq) +#define RT_NODE_3_GET_INSERTPOS RT_MAKE_NAME(node_3_get_insertpos) +#define RT_NODE_32_GET_INSERTPOS RT_MAKE_NAME(node_32_get_insertpos) +#define RT_CHUNK_CHILDREN_ARRAY_SHIFT RT_MAKE_NAME(chunk_children_array_shift) +#define RT_CHUNK_CHILDREN_ARRAY_DELETE RT_MAKE_NAME(chunk_children_array_delete) +#define RT_CHUNK_CHILDREN_ARRAY_COPY RT_MAKE_NAME(chunk_children_array_copy) +#define RT_CHUNK_VALUES_ARRAY_COPY RT_MAKE_NAME(chunk_values_array_copy) +#define RT_NODE_125_IS_CHUNK_USED RT_MAKE_NAME(node_125_is_chunk_used) +#define RT_NODE_INNER_125_GET_CHILD RT_MAKE_NAME(node_inner_125_get_child) +#define RT_NODE_INNER_256_IS_CHUNK_USED RT_MAKE_NAME(node_inner_256_is_chunk_used) +#define RT_NODE_INNER_256_GET_CHILD RT_MAKE_NAME(node_inner_256_get_child) +#define RT_NODE_INNER_256_SET RT_MAKE_NAME(node_inner_256_set) +#define RT_NODE_INNER_256_DELETE RT_MAKE_NAME(node_inner_256_delete) +#define RT_KEY_GET_SHIFT RT_MAKE_NAME(key_get_shift) +#define RT_SHIFT_GET_MAX_VAL RT_MAKE_NAME(shift_get_max_val) +#define RT_NODE_SEARCH_INNER RT_MAKE_NAME(node_search_inner) +#define RT_NODE_SEARCH_LEAF RT_MAKE_NAME(node_search_leaf) +#define RT_NODE_UPDATE_INNER RT_MAKE_NAME(node_update_inner) +#define RT_NODE_DELETE_INNER RT_MAKE_NAME(node_delete_inner) +#define 
RT_NODE_DELETE_LEAF RT_MAKE_NAME(node_delete_leaf) +#define RT_NODE_INSERT_INNER RT_MAKE_NAME(node_insert_inner) +#define RT_ADD_CHILD_4 RT_MAKE_NAME(add_child_4) +#define RT_ADD_CHILD_16 RT_MAKE_NAME(add_child_16) +#define RT_ADD_CHILD_48 RT_MAKE_NAME(add_child_48) +#define RT_ADD_CHILD_256 RT_MAKE_NAME(add_child_256) +#define RT_GROW_NODE_4 RT_MAKE_NAME(grow_node_4) +#define RT_GROW_NODE_16 RT_MAKE_NAME(grow_node_16) +#define RT_GROW_NODE_48 RT_MAKE_NAME(grow_node_48) +#define RT_GROW_NODE_256 RT_MAKE_NAME(grow_node_256) +#define RT_REMOVE_CHILD_4 RT_MAKE_NAME(remove_child_4) +#define RT_REMOVE_CHILD_16 RT_MAKE_NAME(remove_child_16) +#define RT_REMOVE_CHILD_48 RT_MAKE_NAME(remove_child_48) +#define RT_REMOVE_CHILD_256 RT_MAKE_NAME(remove_child_256) +#define RT_NODE_INSERT_LEAF RT_MAKE_NAME(node_insert_leaf) +#define RT_NODE_INNER_ITERATE_NEXT RT_MAKE_NAME(node_inner_iterate_next) +#define RT_NODE_LEAF_ITERATE_NEXT RT_MAKE_NAME(node_leaf_iterate_next) +#define RT_ITER_SET_NODE_FROM RT_MAKE_NAME(iter_set_node_from) +#define RT_ITER_UPDATE_KEY RT_MAKE_NAME(iter_update_key) +#define RT_VERIFY_NODE RT_MAKE_NAME(verify_node) + +/* type declarations */ +#define RT_RADIX_TREE RT_MAKE_NAME(radix_tree) +#define RT_RADIX_TREE_CONTROL RT_MAKE_NAME(radix_tree_control) +#define RT_ITER RT_MAKE_NAME(iter) +#ifdef RT_SHMEM +#define RT_HANDLE RT_MAKE_NAME(handle) +#endif +#define RT_NODE RT_MAKE_NAME(node) +#define RT_NODE_PTR RT_MAKE_NAME(node_ptr) +#define RT_NODE_ITER RT_MAKE_NAME(node_iter) +#define RT_NODE_BASE_4 RT_MAKE_NAME(node_base_4) +#define RT_NODE_BASE_16 RT_MAKE_NAME(node_base_16) +#define RT_NODE_BASE_48 RT_MAKE_NAME(node_base_48) +#define RT_NODE_BASE_256 RT_MAKE_NAME(node_base_256) +#define RT_NODE_INNER_4 RT_MAKE_NAME(node_inner_4) +#define RT_NODE_INNER_16 RT_MAKE_NAME(node_inner_16) +#define RT_NODE_INNER_48 RT_MAKE_NAME(node_inner_48) +#define RT_NODE_INNER_256 RT_MAKE_NAME(node_inner_256) +#define RT_NODE_LEAF_4 RT_MAKE_NAME(node_leaf_4) +#define RT_NODE_LEAF_16 RT_MAKE_NAME(node_leaf_16) +#define RT_NODE_LEAF_48 RT_MAKE_NAME(node_leaf_48) +#define RT_NODE_LEAF_256 RT_MAKE_NAME(node_leaf_256) +#define RT_SIZE_CLASS RT_MAKE_NAME(size_class) +#define RT_SIZE_CLASS_ELEM RT_MAKE_NAME(size_class_elem) +#define RT_SIZE_CLASS_INFO RT_MAKE_NAME(size_class_info) +#define RT_CLASS_4 RT_MAKE_NAME(class_4) +#define RT_CLASS_16_LO RT_MAKE_NAME(class_32_min) +#define RT_CLASS_16_HI RT_MAKE_NAME(class_32_max) +#define RT_CLASS_48 RT_MAKE_NAME(class_48) +#define RT_CLASS_256 RT_MAKE_NAME(class_256) + +/* generate forward declarations necessary to use the radix tree */ +#ifdef RT_DECLARE + +typedef struct RT_RADIX_TREE RT_RADIX_TREE; +typedef struct RT_ITER RT_ITER; + +#ifdef RT_SHMEM +typedef dsa_pointer RT_HANDLE; +#endif +#define RT_PTR_LOCAL RT_NODE * + +#ifdef RT_SHMEM +RT_SCOPE RT_RADIX_TREE * RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id); +RT_SCOPE RT_RADIX_TREE * RT_ATTACH(dsa_area *dsa, dsa_pointer dp); +RT_SCOPE void RT_DETACH(RT_RADIX_TREE *tree); +RT_SCOPE RT_HANDLE RT_GET_HANDLE(RT_RADIX_TREE *tree); +#else +RT_SCOPE RT_RADIX_TREE * RT_CREATE(MemoryContext ctx); +#endif +RT_SCOPE void RT_FREE(RT_RADIX_TREE *tree); + +RT_SCOPE bool RT_SEARCH(RT_RADIX_TREE *tree, uint64 key, RT_VALUE_TYPE *value_p); +RT_SCOPE bool RT_SET(RT_RADIX_TREE *tree, uint64 key, RT_VALUE_TYPE *value_p); +#ifdef RT_USE_DELETE +RT_SCOPE bool RT_DELETE(RT_RADIX_TREE *tree, uint64 key); +#endif + +RT_SCOPE RT_ITER * RT_BEGIN_ITERATE(RT_RADIX_TREE *tree); +RT_SCOPE bool RT_ITERATE_NEXT(RT_ITER *iter, 
uint64 *key_p, RT_VALUE_TYPE *value_p);
+RT_SCOPE void RT_END_ITERATE(RT_ITER *iter);
+
+RT_SCOPE uint64 RT_MEMORY_USAGE(RT_RADIX_TREE *tree);
+
+#if 0
+RT_SCOPE void RT_DUMP(RT_RADIX_TREE *tree);
+RT_SCOPE void RT_DUMP_SEARCH(RT_RADIX_TREE *tree, uint64 key);
+#endif
+
+RT_SCOPE void RT_STATS(RT_RADIX_TREE *tree);
+
+#endif							/* RT_DECLARE */
+
+
+/* generate implementation of the radix tree */
+#ifdef RT_DEFINE
+
+/* The number of bits encoded in one tree level */
+#define RT_NODE_SPAN	BITS_PER_BYTE
+
+/* The maximum number of slots in a node */
+#define RT_NODE_MAX_SLOTS (1 << RT_NODE_SPAN)
+
+/* Mask for extracting a chunk from the key */
+#define RT_CHUNK_MASK ((1 << RT_NODE_SPAN) - 1)
+
+/* Maximum shift the radix tree uses */
+#define RT_MAX_SHIFT	RT_KEY_GET_SHIFT(UINT64_MAX)
+
+/* Maximum number of levels the radix tree can have */
+#define RT_MAX_LEVEL	((sizeof(uint64) * BITS_PER_BYTE) / RT_NODE_SPAN)
+
+/*
+ * Number of bits necessary for the isset array in the slot-index node.
+ * Since bitmapword can be 64 bits, the only values that make sense
+ * here are 64 and 128.
+ */
+#define RT_SLOT_IDX_LIMIT	(RT_NODE_MAX_SLOTS / 2)
+
+/* Invalid index used in node-125 */
+#define RT_INVALID_SLOT_IDX	0xFF
+
+/* Get a chunk from the key */
+#define RT_GET_KEY_CHUNK(key, shift) ((uint8) (((key) >> (shift)) & RT_CHUNK_MASK))
+
+/* For accessing bitmaps */
+#define RT_BM_IDX(x)	((x) / BITS_PER_BITMAPWORD)
+#define RT_BM_BIT(x)	((x) % BITS_PER_BITMAPWORD)
+
+/*
+ * Node kinds
+ *
+ * The different node kinds are what make the tree "adaptive".
+ *
+ * Each node kind is associated with a different datatype and different
+ * search/set/delete/iterate algorithms adapted for its size. The largest
+ * kind, node256, is basically the same as a traditional radix tree,
+ * and would be most wasteful of memory when sparsely populated. The
+ * smaller nodes expend some additional CPU time to enable a smaller
+ * memory footprint.
+ *
+ * XXX There are 4 node kinds, and this should never be increased,
+ * for several reasons:
+ * 1. With 5 or more kinds, gcc tends to use a jump table for switch
+ *    statements.
+ * 2. The 4 kinds can be represented with 2 bits, so we have the option
+ *    in the future to tag the node pointer with the kind, even on
+ *    platforms with 32-bit pointers. This might speed up node traversal
+ *    in trees with highly random node kinds.
+ * 3. We can have multiple size classes per node kind.
+ */
+#define RT_NODE_KIND_4			0x00
+#define RT_NODE_KIND_16			0x01
+#define RT_NODE_KIND_48			0x02
+#define RT_NODE_KIND_256		0x03
+#define RT_NODE_KIND_COUNT		4
+
+/*
+ * Calculate the slab blocksize so that we can allocate at least 32 chunks
+ * from the block.
+ */
+#define RT_SLAB_BLOCK_SIZE(size)	\
+	Max((SLAB_DEFAULT_BLOCK_SIZE / (size)) * (size), (size) * 32)
+
+/* Common type for all node types */
+typedef struct RT_NODE
+{
+	/*
+	 * Number of children. uint8 is sufficient for all node kinds, because
+	 * nodes shrink when this number gets lower than some threshold. Since
+	 * node256 cannot possibly have zero children, we let the counter
+	 * overflow and we interpret zero as "256" for this node kind.
+	 */
+	uint8		count;
+
+	/*
+	 * Max capacity for the current size class. Storing this in the node
+	 * enables multiple size classes per node kind. Technically, kinds with
+	 * a single size class don't need this, so we could keep this in the
+	 * individual base types, but the code is simpler this way.
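+	 * (For illustration: a kind-16 node reports a fanout of either 15 or
+	 * 32 here, depending on which of its two size classes it was
+	 * allocated with; see RT_SIZE_CLASS_INFO below.)
+	 *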
+ * Note: node256 is unique in that it cannot possibly have more than a + * single size class, so for that kind we store zero, and uint8 is + * sufficient for other kinds. + */ + uint8 fanout; + + /* Node kind, one per search/set algorithm */ + uint8 kind; +} RT_NODE; + + +#define RT_PTR_LOCAL RT_NODE * + +/* pointer returned by allocation */ +#ifdef RT_SHMEM +#define RT_PTR_ALLOC dsa_pointer +#else +#define RT_PTR_ALLOC RT_PTR_LOCAL +#endif + + +#ifdef RT_SHMEM +#define RT_INVALID_PTR_ALLOC InvalidDsaPointer +#else +#define RT_INVALID_PTR_ALLOC NULL +#endif + +#ifdef RT_SHMEM +#define RT_LOCK_EXCLUSIVE(tree) LWLockAcquire(&tree->ctl->lock, LW_EXCLUSIVE) +#define RT_LOCK_SHARED(tree) LWLockAcquire(&tree->ctl->lock, LW_SHARED) +#define RT_UNLOCK(tree) LWLockRelease(&tree->ctl->lock); +#else +#define RT_LOCK_EXCLUSIVE(tree) ((void) 0) +#define RT_LOCK_SHARED(tree) ((void) 0) +#define RT_UNLOCK(tree) ((void) 0) +#endif + +// fixme +#define RT_NODE_IS_LEAF(x) false + +//todo: caller can define function to abbreviate value +#define RT_VALUE_IS_EMBEDDABLE (sizeof(RT_VALUE_TYPE) <= SIZEOF_VOID_P) + +/* + * Inner nodes and leaf nodes have analogous structure. To distinguish + * them at runtime, we take advantage of the fact that the key chunk + * is accessed by shifting: Inner tree nodes (shift > 0), store the + * pointer to its child node in the slot. In leaf nodes (shift == 0), + * the slot contains the value corresponding to the key. + */ + +#define RT_NODE_MUST_GROW(node) \ + ((node)->base.n.count == (node)->base.n.fanout) + +#ifdef RT_SHMEM +typedef struct RT_NODE_PTR +#else +typedef union RT_NODE_PTR +#endif +{ + RT_PTR_ALLOC alloc; + RT_PTR_LOCAL local; +} RT_NODE_PTR; + +/* + * Base type of each node kinds for leaf and inner nodes. + * The base types must be a be able to accommodate the largest size + * class for variable-sized node kinds. + */ +typedef struct RT_NODE_BASE_4 +{ + RT_NODE n; + + /* 3 children, for key chunks */ + uint8 chunks[3]; +} RT_NODE_BASE_4; + +typedef struct RT_NODE_BASE_16 +{ + RT_NODE n; + + /* 32 children, for key chunks */ + uint8 chunks[32]; +} RT_NODE_BASE_16; + +/* + * node-125 uses slot_idx array, an array of RT_NODE_MAX_SLOTS length + * to store indexes into a second array that contains the values (or + * child pointers). + */ +typedef struct RT_NODE_BASE_48 +{ + RT_NODE n; + + /* The index of slots for each fanout */ + uint8 slot_idxs[RT_NODE_MAX_SLOTS]; + + /* bitmap to track which slots are in use */ + bitmapword isset[RT_BM_IDX(RT_SLOT_IDX_LIMIT)]; +} RT_NODE_BASE_48; + +typedef struct RT_NODE_BASE_256 +{ + RT_NODE n; +} RT_NODE_BASE_256; + +/* + * Inner and leaf nodes. + * + * These are separate because the value type might be different than + * something fitting into a pointer-width type. 
+ */ +typedef struct RT_NODE_INNER_4 +{ + RT_NODE_BASE_4 base; + + /* number of children depends on size class */ + RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_INNER_4; + +typedef struct RT_NODE_LEAF_4 +{ + RT_NODE_BASE_4 base; + + /* number of values depends on size class */ + RT_VALUE_TYPE values[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_LEAF_4; + +typedef struct RT_NODE_INNER_16 +{ + RT_NODE_BASE_16 base; + + /* number of children depends on size class */ + RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_INNER_16; + +typedef struct RT_NODE_LEAF_16 +{ + RT_NODE_BASE_16 base; + + /* number of values depends on size class */ + RT_VALUE_TYPE values[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_LEAF_16; + +typedef struct RT_NODE_INNER_48 +{ + RT_NODE_BASE_48 base; + + /* number of children depends on size class */ + RT_PTR_ALLOC children[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_INNER_48; + +typedef struct RT_NODE_LEAF_48 +{ + RT_NODE_BASE_48 base; + + /* number of values depends on size class */ + RT_VALUE_TYPE values[FLEXIBLE_ARRAY_MEMBER]; +} RT_NODE_LEAF_48; + +/* + * node-256 is the largest node type. This node has an array + * for directly storing values (or child pointers in inner nodes). + * Unlike other node kinds, it's array size is by definition + * fixed. + */ +typedef struct RT_NODE_INNER_256 +{ + RT_NODE_BASE_256 base; + + /* + * Zero is a valid value for embedded values, so we use a + * bitmap to track which slots are in use. + */ + bitmapword isset[RT_BM_IDX(RT_NODE_MAX_SLOTS)]; + + /* Slots for 256 children */ + RT_PTR_ALLOC children[RT_NODE_MAX_SLOTS]; +} RT_NODE_INNER_256; + +typedef struct RT_NODE_LEAF_256 +{ + RT_NODE_BASE_256 base; + + /* + * Unlike with inner256, zero is a valid value here, so we use a + * bitmap to track which slots are in use. + */ + bitmapword isset[RT_BM_IDX(RT_NODE_MAX_SLOTS)]; + + /* Slots for 256 values */ + RT_VALUE_TYPE values[RT_NODE_MAX_SLOTS]; +} RT_NODE_LEAF_256; + +/* + * Node size classes + * + * Nodes of different kinds necessarily belong to different size classes. + * The main innovation in our implementation compared to the ART paper + * is decoupling the notion of size class from kind. + * + * The size classes within a given node kind have the same underlying + * type, but a variable number of children/values. This is possible + * because the base type contains small fixed data structures that + * work the same way regardless of how full the node is. We store the + * node's allocated capacity in the "fanout" member of RT_NODE, to allow + * runtime introspection. + * + * Growing from one node kind to another requires special code for each + * case, but growing from one size class to another within the same kind + * is basically just allocate + memcpy. + * + * The size classes have been chosen so that inner nodes on platforms + * with 64-bit pointers (and leaf nodes when using a 64-bit key) are + * equal to or slightly smaller than some DSA size class. 
+ */ +typedef enum RT_SIZE_CLASS +{ + RT_CLASS_4 = 0, + RT_CLASS_16_LO, + RT_CLASS_16_HI, + RT_CLASS_48, + RT_CLASS_256 +} RT_SIZE_CLASS; + +// todo: macro based on DSA segment sizes +#define RT_FANOUT_4 3 /* todo: (8 - sizeof(RT_NODE)) */ +#define RT_FANOUT_16_LO 15 /* todo: (160 - RT_FANOUT_16_HI - MAXALIGN(sizeof(RT_NODE)) / sizeof(uint64)) */ +#define RT_FANOUT_16_HI 32 +#define RT_FANOUT_48 125 /* todo: like above but 768 (63) */ +#define RT_FANOUT_256 256 + +/* Information for each size class */ +typedef struct RT_SIZE_CLASS_ELEM +{ + const char *name; + int fanout; + + /* slab chunk size */ + Size inner_size; +} RT_SIZE_CLASS_ELEM; + +// todo: adjust name automatically - scanf()? +static const RT_SIZE_CLASS_ELEM RT_SIZE_CLASS_INFO[] = { + [RT_CLASS_4] = { + .name = "radix tree node 3", + .fanout = RT_FANOUT_4, + .inner_size = sizeof(RT_NODE_INNER_4) + RT_FANOUT_4 * sizeof(RT_PTR_ALLOC), + }, + [RT_CLASS_16_LO] = { + .name = "radix tree node 15", + .fanout = RT_FANOUT_16_LO, + .inner_size = sizeof(RT_NODE_INNER_16) + RT_FANOUT_16_LO * sizeof(RT_PTR_ALLOC), + }, + [RT_CLASS_16_HI] = { + .name = "radix tree node 32", + .fanout = RT_FANOUT_16_HI, + .inner_size = sizeof(RT_NODE_INNER_16) + RT_FANOUT_16_HI * sizeof(RT_PTR_ALLOC), + }, + [RT_CLASS_48] = { + .name = "radix tree node 125", + .fanout = RT_FANOUT_48, + .inner_size = sizeof(RT_NODE_INNER_48) + RT_FANOUT_48 * sizeof(RT_PTR_ALLOC), + }, + [RT_CLASS_256] = { + .name = "radix tree node 256", + .fanout = RT_FANOUT_256, + .inner_size = sizeof(RT_NODE_INNER_256), + }, +}; + +#define RT_SIZE_CLASS_COUNT lengthof(RT_SIZE_CLASS_INFO) + +#ifdef RT_SHMEM +/* A magic value used to identify our radix tree */ +#define RT_RADIX_TREE_MAGIC 0x54A48167 +#endif + +/* Contains the actual tree and ancillary info */ +typedef struct RT_RADIX_TREE_CONTROL +{ +#ifdef RT_SHMEM + RT_HANDLE handle; + uint32 magic; + LWLock lock; +#endif + + RT_PTR_ALLOC root; + uint64 max_val; + uint64 num_keys; + int start_shift; // xxx + + /* statistics */ +#ifdef RT_DEBUG + int32 cnt[RT_SIZE_CLASS_COUNT]; + int32 leafcnt; +#endif +} RT_RADIX_TREE_CONTROL; + +/* Entry point for allocating and accessing the tree */ +typedef struct RT_RADIX_TREE +{ + MemoryContext context; + + /* pointing to either local memory or DSA */ + RT_RADIX_TREE_CONTROL *ctl; + +#ifdef RT_SHMEM + dsa_area *dsa; +#else + MemoryContextData *inner_slabs[RT_SIZE_CLASS_COUNT]; + MemoryContextData *leaf_slab; +#endif +} RT_RADIX_TREE; + +/* + * Iteration support. + * + * Iterating the radix tree returns each pair of key and value in the ascending + * order of the key. + * + * RT_NODE_ITER is the struct for iteration of one radix tree node. + * + * RT_ITER is the struct for iteration of the radix tree, and uses RT_NODE_ITER + * for each level to track the iteration within the node. + */ +typedef struct RT_NODE_ITER +{ + RT_NODE_PTR node; + + /* + * The next index of the chunk array in RT_NODE_KIND_4 and + * RT_NODE_KIND_16 nodes, or the next chunk in RT_NODE_KIND_48 and + * RT_NODE_KIND_256 nodes. 0 for the initial value. + */ + int idx; +} RT_NODE_ITER; + +typedef struct RT_ITER +{ + RT_RADIX_TREE *tree; + + /* Track the nodes for each level. 
level = 0 is for a leaf node */ + RT_NODE_ITER node_iters[RT_MAX_LEVEL]; + int top_level; + + /* The key constructed during the iteration */ + uint64 key; +} RT_ITER; + + +static void RT_NODE_INSERT_INNER(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child); + +/* verification (available only with assertion) */ +static void RT_VERIFY_NODE(RT_PTR_LOCAL node); + +static inline void +RT_PTR_SET_LOCAL(RT_RADIX_TREE *tree, RT_NODE_PTR *node) +{ +#ifdef RT_SHMEM + node->local = dsa_get_address(tree->dsa, node->alloc); +#else +#endif +} + +static inline bool +RT_PTR_ALLOC_IS_VALID(RT_PTR_ALLOC ptr) +{ +#ifdef RT_SHMEM + return DsaPointerIsValid(ptr); +#else + return PointerIsValid(ptr); +#endif +} + +/* + * Return index of the first element in the node's chunk array that equals + * 'chunk'. Return -1 if there is no such element. + */ +static inline int +RT_NODE_3_SEARCH_EQ(RT_NODE_BASE_4 *node, uint8 chunk) +{ + int idx = -1; + + for (int i = 0; i < node->n.count; i++) + { + if (node->chunks[i] == chunk) + { + idx = i; + break; + } + } + + return idx; +} + +/* + * Return index of the chunk and slot arrays for inserting into the node, + * such that the chunk array remains ordered. + */ +static inline int +RT_NODE_3_GET_INSERTPOS(RT_NODE_BASE_4 *node, uint8 chunk) +{ + int idx; + + for (idx = 0; idx < node->n.count; idx++) + { + if (node->chunks[idx] >= chunk) + break; + } + + return idx; +} + +/* + * Return index of the first element in the node's chunk array that equals + * 'chunk'. Return -1 if there is no such element. + */ +static inline int +RT_NODE_32_SEARCH_EQ(RT_NODE_BASE_16 *node, uint8 chunk) +{ + int count = node->n.count; +#ifndef USE_NO_SIMD + Vector8 spread_chunk; + Vector8 haystack1; + Vector8 haystack2; + Vector8 cmp1; + Vector8 cmp2; + uint32 bitfield; + int index_simd = -1; +#endif + +#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING) + int index = -1; + + for (int i = 0; i < count; i++) + { + if (node->chunks[i] == chunk) + { + index = i; + break; + } + } +#endif + +#ifndef USE_NO_SIMD + /* replicate the search key */ + spread_chunk = vector8_broadcast(chunk); + + /* compare to all 32 keys stored in the node */ + vector8_load(&haystack1, &node->chunks[0]); + vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]); + cmp1 = vector8_eq(spread_chunk, haystack1); + cmp2 = vector8_eq(spread_chunk, haystack2); + + /* convert comparison to a bitfield */ + bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8)); + + /* mask off invalid entries */ + bitfield &= ((UINT64CONST(1) << count) - 1); + + /* convert bitfield to index by counting trailing zeros */ + if (bitfield) + index_simd = pg_rightmost_one_pos32(bitfield); + + Assert(index_simd == index); + return index_simd; +#else + return index; +#endif +} + +/* + * Return index of the chunk and slot arrays for inserting into the node, + * such that the chunk array remains ordered. + */ +static inline int +RT_NODE_32_GET_INSERTPOS(RT_NODE_BASE_16 *node, uint8 chunk) +{ + int count = node->n.count; +#ifndef USE_NO_SIMD + Vector8 spread_chunk; + Vector8 haystack1; + Vector8 haystack2; + Vector8 cmp1; + Vector8 cmp2; + Vector8 min1; + Vector8 min2; + uint32 bitfield; + int index_simd; +#endif + +#if defined(USE_NO_SIMD) || defined(USE_ASSERT_CHECKING) + int index; + + for (index = 0; index < count; index++) + { + /* + * This is coded with '>=' to match what we can do with SIMD, + * with an assert to keep us honest. 
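+		 * (The reason '>=' is what SIMD can do: SSE2 has no unsigned
+		 * byte comparison, but vector8_min(spread_chunk, haystack)
+		 * equals spread_chunk in exactly those lanes where the search
+		 * key is <= the stored chunk, so testing the min for equality
+		 * against spread_chunk computes "element >= chunk". See the
+		 * SIMD branch below.)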
+ */ + if (node->chunks[index] >= chunk) + { + Assert(node->chunks[index] != chunk); + break; + } + } +#endif + +#ifndef USE_NO_SIMD + /* + * This is a bit more complicated than RT_NODE_32_SEARCH_EQ(), because + * no unsigned uint8 comparison instruction exists, at least for SSE2. So + * we need to play some trickery using vector8_min() to effectively get + * >=. There'll never be any equal elements in current uses, but that's + * what we get here... + */ + spread_chunk = vector8_broadcast(chunk); + vector8_load(&haystack1, &node->chunks[0]); + vector8_load(&haystack2, &node->chunks[sizeof(Vector8)]); + min1 = vector8_min(spread_chunk, haystack1); + min2 = vector8_min(spread_chunk, haystack2); + cmp1 = vector8_eq(spread_chunk, min1); + cmp2 = vector8_eq(spread_chunk, min2); + bitfield = vector8_highbit_mask(cmp1) | (vector8_highbit_mask(cmp2) << sizeof(Vector8)); + bitfield &= ((UINT64CONST(1) << count) - 1); + + if (bitfield) + index_simd = pg_rightmost_one_pos32(bitfield); + else + index_simd = count; + + Assert(index_simd == index); + return index_simd; +#else + return index; +#endif +} + + +/* + * Functions to manipulate both chunks array and children/values array. + * These are used for node-3 and node-32. + * TODO: replace slow memmove's + */ + +/* Shift the elements right at 'idx' by one */ +static inline void +RT_CHUNK_CHILDREN_ARRAY_SHIFT(uint8 *chunks, RT_PTR_ALLOC *children, int count, int idx) +{ + memmove(&(chunks[idx + 1]), &(chunks[idx]), sizeof(uint8) * (count - idx)); + memmove(&(children[idx + 1]), &(children[idx]), sizeof(RT_PTR_ALLOC) * (count - idx)); +} + +/* Delete the element at 'idx' */ +static inline void +RT_CHUNK_CHILDREN_ARRAY_DELETE(uint8 *chunks, RT_PTR_ALLOC *children, int count, int idx) +{ + memmove(&(chunks[idx]), &(chunks[idx + 1]), sizeof(uint8) * (count - idx - 1)); + memmove(&(children[idx]), &(children[idx + 1]), sizeof(RT_PTR_ALLOC) * (count - idx - 1)); +} + +/* Copy both chunks and children/values arrays */ +static inline void +RT_CHUNK_CHILDREN_ARRAY_COPY(uint8 *src_chunks, RT_PTR_ALLOC *src_children, + uint8 *dst_chunks, RT_PTR_ALLOC *dst_children, + uint8 chunk, RT_PTR_ALLOC child, int insertpos, int count) +{ + /* first copy old elements before insertpos */ + memcpy(&dst_chunks[0], &src_chunks[0], + insertpos * sizeof(src_chunks[0])); + memcpy(&dst_children[0], &src_children[0], + insertpos * sizeof(src_children[0])); + + /* then the new element */ + dst_chunks[insertpos] = chunk; + dst_children[insertpos] = child; + + /* and lastly the old elements after */ + memcpy(&dst_chunks[insertpos + 1], &src_chunks[insertpos], + (count - insertpos) * sizeof(src_chunks[0])); + memcpy(&dst_children[insertpos + 1], &src_children[insertpos], + (count - insertpos) * sizeof(src_children[0])); +} + +static inline void +RT_CHUNK_VALUES_ARRAY_COPY(uint8 *src_chunks, RT_VALUE_TYPE *src_values, + uint8 *dst_chunks, RT_VALUE_TYPE *dst_values) +{ + const int fanout = RT_SIZE_CLASS_INFO[RT_CLASS_4].fanout; + const Size chunk_size = sizeof(uint8) * fanout; + const Size values_size = sizeof(RT_VALUE_TYPE) * fanout; + + memcpy(dst_chunks, src_chunks, chunk_size); + memcpy(dst_values, src_values, values_size); +} + +/* Functions to manipulate inner and leaf node-125 */ + +/* Does the given chunk in the node has the value? 
 */
+static inline bool
+RT_NODE_125_IS_CHUNK_USED(RT_NODE_BASE_48 *node, uint8 chunk)
+{
+	return node->slot_idxs[chunk] != RT_INVALID_SLOT_IDX;
+}
+
+static inline RT_PTR_ALLOC*
+RT_NODE_INNER_125_GET_CHILD(RT_NODE_INNER_48 *node, uint8 chunk)
+{
+	return &node->children[node->base.slot_idxs[chunk]];
+}
+
+/* Functions to manipulate inner and leaf node-256 */
+
+/* Return true if the slot corresponding to the given chunk is in use */
+static inline bool
+RT_NODE_INNER_256_IS_CHUNK_USED(RT_NODE_INNER_256 *node, uint8 chunk)
+{
+	int			idx = RT_BM_IDX(chunk);
+	int			bitnum = RT_BM_BIT(chunk);
+
+	return (node->isset[idx] & ((bitmapword) 1 << bitnum)) != 0;
+}
+
+static inline RT_PTR_ALLOC*
+RT_NODE_INNER_256_GET_CHILD(RT_NODE_INNER_256 *node, uint8 chunk)
+{
+	Assert(RT_NODE_INNER_256_IS_CHUNK_USED(node, chunk));
+	return &node->children[chunk];
+}
+
+/* Set the child in the node-256 */
+static inline void
+RT_NODE_INNER_256_SET(RT_NODE_INNER_256 *node, uint8 chunk, RT_PTR_ALLOC child)
+{
+	int			idx = RT_BM_IDX(chunk);
+	int			bitnum = RT_BM_BIT(chunk);
+
+	node->isset[idx] |= ((bitmapword) 1 << bitnum);
+	node->children[chunk] = child;
+}
+
+/* Delete the slot at the given chunk position */
+static inline void
+RT_NODE_INNER_256_DELETE(RT_NODE_INNER_256 *node, uint8 chunk)
+{
+	int			idx = RT_BM_IDX(chunk);
+	int			bitnum = RT_BM_BIT(chunk);
+
+	node->isset[idx] &= ~((bitmapword) 1 << bitnum);
+}
+
+/*
+ * Return the largest shift that allows storing the given key.
+ */
+static inline int
+RT_KEY_GET_SHIFT(uint64 key)
+{
+	if (key == 0)
+		return 0;
+	else
+		return (pg_leftmost_one_pos64(key) / RT_NODE_SPAN) * RT_NODE_SPAN;
+}
+
+/*
+ * Return the max value that can be stored in the tree with the given shift.
+ */
+static uint64
+RT_SHIFT_GET_MAX_VAL(int shift)
+{
+	if (shift == RT_MAX_SHIFT)
+		return UINT64_MAX;
+
+	return (UINT64CONST(1) << (shift + RT_NODE_SPAN)) - 1;
+}
+
+/*
+ * Allocate a new node with the given node kind and size class.
+ */
+static inline RT_NODE_PTR
+RT_ALLOC_NODE(RT_RADIX_TREE *tree, const uint8 kind, const RT_SIZE_CLASS size_class, bool is_leaf)
+{
+	RT_NODE_PTR	allocnode;
+	RT_PTR_LOCAL node;
+	size_t		allocsize;
+
+	allocsize = RT_SIZE_CLASS_INFO[size_class].inner_size;
+
+#ifdef RT_SHMEM
+	allocnode.alloc = dsa_allocate(tree->dsa, allocsize);
+#else
+	allocnode.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->inner_slabs[size_class],
+														allocsize);
+#endif
+
+	RT_PTR_SET_LOCAL(tree, &allocnode);
+	node = allocnode.local;
+
+	/* initialize contents */
+
+	memset(node, 0, sizeof(RT_NODE));
+	switch (kind)
+	{
+		case RT_NODE_KIND_4:
+		case RT_NODE_KIND_16:
+			break;
+		case RT_NODE_KIND_48:
+			{
+				RT_NODE_BASE_48 *n48 = (RT_NODE_BASE_48 *) node;
+
+				memset(n48->isset, 0, sizeof(n48->isset));
+				memset(n48->slot_idxs, RT_INVALID_SLOT_IDX, sizeof(n48->slot_idxs));
+				break;
+			}
+		case RT_NODE_KIND_256:
+			{
+				RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node;
+
+				memset(n256->isset, 0, sizeof(n256->isset));
+				break;
+			}
+		default:
+			pg_unreachable();
+	}
+
+	node->kind = kind;
+	if (kind == RT_NODE_KIND_256)
+		/* See comment for the RT_NODE type */
+		// todo remove actual value from lookup table
+		Assert(node->fanout == 0);
+	else
+		node->fanout = RT_SIZE_CLASS_INFO[size_class].fanout;
+
+#ifdef RT_DEBUG
+	/* update the statistics */
+	tree->ctl->cnt[size_class]++;
+#endif
+
+	return allocnode;
+}
+
+/*
+ * Allocate a new leaf.
+ * XXX do we really need this separate from RT_ALLOC_NODE? We will
+ * if we need variable-sized leaves.
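+ * Note: when RT_VALUE_IS_EMBEDDABLE, callers skip this function and
+ * store the value directly into the parent's slot instead; see
+ * RT_EXTEND_DOWN and RT_RECURSIVE_SET.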
+ */ +static RT_NODE_PTR +RT_ALLOC_LEAF(RT_RADIX_TREE *tree) +{ + RT_NODE_PTR leaf; + size_t allocsize = sizeof(RT_VALUE_TYPE); + +#ifdef RT_SHMEM + leaf.alloc = dsa_allocate(tree->dsa, allocsize); +#else + leaf.alloc = (RT_PTR_ALLOC) MemoryContextAlloc(tree->leaf_slab, allocsize); +#endif + + RT_PTR_SET_LOCAL(tree, &leaf); + +#ifdef RT_DEBUG + tree->ctl->leafcnt++; +#endif + + return leaf; +} + +/* + * Create a new node as the root. Subordinate nodes will be created during + * the insertion. + */ +static pg_noinline void +RT_NEW_ROOT(RT_RADIX_TREE *tree, uint64 key) +{ + int shift = RT_KEY_GET_SHIFT(key); + bool is_leaf = false; + RT_NODE_PTR node; + + node = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4, is_leaf); + tree->ctl->start_shift = shift; + tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(shift); + tree->ctl->root = node.alloc; +} + +/* + * Copy relevant members of the node header. + * This is a separate function in case other fields are added. + */ +static inline void +RT_COPY_COMMON(RT_NODE_PTR newnode, RT_NODE_PTR oldnode) +{ + (newnode.local)->count = (oldnode.local)->count; +} + +/* Free the given node */ +static void +RT_FREE_NODE(RT_RADIX_TREE *tree, RT_NODE_PTR node) +{ +#ifdef RT_DEBUG + { + int i; + + /* update the statistics */ + for (i = 0; i < RT_SIZE_CLASS_COUNT; i++) + { + if ((node.local)->fanout == RT_SIZE_CLASS_INFO[i].fanout) + break; + } + + /* fanout of node256 is intentionally 0 */ + if (i == RT_SIZE_CLASS_COUNT) + i = RT_CLASS_256; + + tree->ctl->cnt[i]--; + Assert(tree->ctl->cnt[i] >= 0); + } +#endif + +#ifdef RT_SHMEM + dsa_free(tree->dsa, node.alloc); +#else + pfree(node.alloc); +#endif +} + +static inline void +RT_FREE_LEAF(RT_RADIX_TREE *tree, RT_NODE_PTR node) +{ + // because no lazy expansion yet + Assert(node.alloc != tree->ctl->root); + +#ifdef RT_DEBUG + tree->ctl->leafcnt--; + Assert(tree->ctl->leafcnt >= 0); +#endif + +#ifdef RT_SHMEM + dsa_free(tree->dsa, node.alloc); +#else + pfree(node.alloc); +#endif +} + +/* + * The radix tree doesn't have sufficient height. Extend the radix tree so + * it can store the key. + */ +static pg_noinline void +RT_EXTEND_UP(RT_RADIX_TREE *tree, uint64 key) +{ + int target_shift; + // todo: move inside loop + int shift = tree->ctl->start_shift + RT_NODE_SPAN; + + target_shift = RT_KEY_GET_SHIFT(key); + + /* Grow tree from 'shift' to 'target_shift' */ + while (shift <= target_shift) + { + RT_NODE_PTR node; + RT_NODE_INNER_4 *n4; + + node = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4, false); + n4 = (RT_NODE_INNER_4 *) node.local; + n4->base.n.count = 1; + n4->base.chunks[0] = 0; + n4->children[0] = tree->ctl->root; + + /* Update the root */ + tree->ctl->root = node.alloc; + + shift += RT_NODE_SPAN; + } + + tree->ctl->max_val = RT_SHIFT_GET_MAX_VAL(target_shift); + tree->ctl->start_shift = target_shift; +} + +/* + * Search for the child pointer corresponding to 'key' in the given node. + * + * Return child if the key is found, otherwise return NULL. 
+ */ +static inline RT_PTR_ALLOC * +RT_NODE_SEARCH_INNER(RT_PTR_LOCAL node, uint8 chunk) +{ + /* Make sure we already converted to local pointer */ + Assert(node != NULL); + + switch (node->kind) + { + case RT_NODE_KIND_4: + { + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) node; + int idx = RT_NODE_3_SEARCH_EQ((RT_NODE_BASE_4 *) n4, chunk); + + if (idx < 0) + return NULL; + + return &n4->children[idx]; + } + case RT_NODE_KIND_16: + { + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node; + int idx = RT_NODE_32_SEARCH_EQ((RT_NODE_BASE_16 *) n16, chunk); + + if (idx < 0) + return NULL; + + return &n16->children[idx]; + } + case RT_NODE_KIND_48: + { + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) node; + int slotpos = n48->base.slot_idxs[chunk]; + + if (slotpos == RT_INVALID_SLOT_IDX) + return NULL; + + return RT_NODE_INNER_125_GET_CHILD(n48, chunk); + } + case RT_NODE_KIND_256: + { + RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node; + + if (!RT_NODE_INNER_256_IS_CHUNK_USED(n256, chunk)) + return NULL; + + return RT_NODE_INNER_256_GET_CHILD(n256, chunk); + } + default: + pg_unreachable(); + } +} + +#ifdef RT_USE_DELETE + +/* + * When shrinking nodes, we generally wait until the count is about 3/4 + * of the next lower node's fanout. This prevents ping-ponging between + * different node sizes. + * TODO: When shrinking to node4, 3 should be hard-coded, as that's the + * largest count where linear search is faster than SIMD, at least on + * x86-64. + */ + +static inline void +RT_REMOVE_CHILD_256(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint8 chunk) +{ + int shrink_threshold; + RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node.local; + + RT_NODE_INNER_256_DELETE(n256, chunk); + + n256->base.n.count--; + + /* to keep isset coding below simple, for now at least */ + shrink_threshold = sizeof(bitmapword) * BITS_PER_BYTE; + shrink_threshold = Min(RT_FANOUT_48 / 4 * 3, shrink_threshold); + + if (n256->base.n.count < shrink_threshold) + { + RT_NODE_PTR newnode; + RT_NODE_INNER_48 *new48; + int slot_idx = 0; + + /* initialize new node */ + newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48, false); + new48 = (RT_NODE_INNER_48 *) newnode.local; + + /* copy over the entries */ + RT_COPY_COMMON(newnode, node); + for (int i = 0; i < 256; i++) + { + if (RT_NODE_INNER_256_IS_CHUNK_USED(n256, i)) + { + new48->base.slot_idxs[i] = slot_idx; + new48->children[slot_idx] = n256->children[i]; + slot_idx++; + } + } + + /* + * Since we just copied a dense array, we can fill "isset" + * using a single store, provided the length of that array + * is at most the number of bits in a bitmapword. 
+ */ + Assert(n256->base.n.count <= sizeof(bitmapword) * BITS_PER_BYTE); + new48->base.isset[0] = (bitmapword) (((uint64) 1 << n256->base.n.count) - 1); + + /* free old node and update reference in parent */ + *ref = newnode.alloc; + RT_FREE_NODE(tree, node); + } +} + + +static inline void +RT_REMOVE_CHILD_48(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint8 chunk) +{ + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) node.local; + int slotpos = n48->base.slot_idxs[chunk]; + int idx; + int bitnum; + + Assert(slotpos != RT_INVALID_SLOT_IDX); + + idx = RT_BM_IDX(slotpos); + bitnum = RT_BM_BIT(slotpos); + n48->base.isset[idx] &= ~((bitmapword) 1 << bitnum); + n48->base.slot_idxs[chunk] = RT_INVALID_SLOT_IDX; + + n48->base.n.count--; +} + +static inline void +RT_REMOVE_CHILD_16(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint8 chunk, RT_PTR_ALLOC *slot) +{ + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node.local; + int idx = slot - n16->children;; + + Assert(idx >= 0); + Assert(n16->base.chunks[idx] == chunk); + + RT_CHUNK_CHILDREN_ARRAY_DELETE(n16->base.chunks, n16->children, + n16->base.n.count, idx); + n16->base.n.count--; +} + +static inline void +RT_REMOVE_CHILD_4(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint8 chunk, RT_PTR_ALLOC *slot) +{ + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) node.local; + + if (n4->base.n.count == 1) + { + Assert(n4->base.chunks[0] == chunk); + + /* deleting last entry, so just free the node and null out the parent's slot */ + // We assume the caller already freed the child, if necessary + RT_FREE_NODE(tree, node); + *ref = RT_INVALID_PTR_ALLOC; + + /* If we're deleting the root node, make the tree empty */ + if (ref == &tree->ctl->root) + tree->ctl->max_val = 0; + } + else + { + int idx = slot - n4->children;; + + Assert(idx >= 0); + Assert(n4->base.chunks[idx] == chunk); + + RT_CHUNK_CHILDREN_ARRAY_DELETE(n4->base.chunks, n4->children, + n4->base.n.count, idx); + + n4->base.n.count--; + } +} + +/* + * Search for the child pointer corresponding to 'key' in the given node. + * + * Delete the node and return true if the key is found, otherwise return false. 
+ */ +static inline void +RT_NODE_DELETE_INNER(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint8 chunk, RT_PTR_ALLOC *slot) +{ + switch ((node.local)->kind) + { + case RT_NODE_KIND_4: + return RT_REMOVE_CHILD_4(tree, ref, node, chunk, slot); + case RT_NODE_KIND_16: + return RT_REMOVE_CHILD_16(tree, ref, node, chunk, slot); + case RT_NODE_KIND_48: + return RT_REMOVE_CHILD_48(tree, ref, node, chunk); + case RT_NODE_KIND_256: + return RT_REMOVE_CHILD_256(tree, ref, node, chunk); + default: + pg_unreachable(); + } +} + +#endif + +static inline void +RT_ADD_CHILD_256(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node.local; + + RT_NODE_INNER_256_SET(n256, chunk, child); + + n256->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) n256); +} + +static pg_noinline void +RT_GROW_NODE_48(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) node.local; + + RT_NODE_PTR newnode; + RT_NODE_INNER_256 *new256; + int cnt = 0; + + const bool is_leaf = false; // xxx + + /* initialize new node */ + newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_256, RT_CLASS_256, is_leaf); + new256 = (RT_NODE_INNER_256 *) newnode.local; + + /* copy over the entries */ + RT_COPY_COMMON(newnode, node); + for (int i = 0; i < RT_NODE_MAX_SLOTS && cnt < n48->base.n.count; i++) + { + if (!RT_NODE_125_IS_CHUNK_USED(&n48->base, i)) + continue; + + RT_NODE_INNER_256_SET(new256, i, + *RT_NODE_INNER_125_GET_CHILD(n48, i)); + cnt++; + } + + /* free old node and update reference in parent */ + *ref = newnode.alloc; + RT_FREE_NODE(tree, node); + + RT_ADD_CHILD_256(tree, ref, newnode, chunk, child); +} + +static inline void +RT_ADD_CHILD_48(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) node.local; + + if (unlikely(RT_NODE_MUST_GROW(n48))) + { + RT_GROW_NODE_48(tree, ref, node, chunk, child); + } + else + { + int slotpos; + int idx; + bitmapword inverse; + + /* get the first word with at least one bit not set */ + for (idx = 0; idx < RT_BM_IDX(RT_SLOT_IDX_LIMIT); idx++) + { + if (n48->base.isset[idx] < ~((bitmapword) 0)) + break; + } + + /* To get the first unset bit in X, get the first set bit in ~X */ + inverse = ~(n48->base.isset[idx]); + slotpos = idx * BITS_PER_BITMAPWORD; + slotpos += bmw_rightmost_one_pos(inverse); + Assert(slotpos < n48->base.n.fanout); + + /* mark the slot used */ + n48->base.isset[idx] |= bmw_rightmost_one(inverse); + n48->base.slot_idxs[chunk] = slotpos; + + n48->children[slotpos] = child; + n48->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) n48); + } +} + +static pg_noinline void +RT_GROW_NODE_16(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + const bool is_leaf = false; // xxx + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node.local; + + if (n16->base.n.fanout < RT_FANOUT_16_HI) + { + RT_NODE_PTR newnode; + RT_NODE_INNER_16 *new16; + int insertpos = RT_NODE_32_GET_INSERTPOS(&n16->base, chunk); + + Assert(n16->base.n.fanout == RT_FANOUT_16_LO); + + /* initialize new node */ + newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_HI, is_leaf); + new16 = (RT_NODE_INNER_16 *) newnode.local; + + /* copy over existing entries and insert new one */ + RT_COPY_COMMON(newnode, node); + RT_CHUNK_CHILDREN_ARRAY_COPY(n16->base.chunks, n16->children, + new16->base.chunks, 
new16->children, + chunk, child, insertpos, n16->base.n.count); + + /* update the fanout */ + new16->base.n.fanout = RT_FANOUT_16_HI; + + new16->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) new16); + + /* free old node and update references */ + RT_FREE_NODE(tree, node); + *ref = newnode.alloc; + } + else + { + RT_NODE_PTR newnode; + RT_NODE_INNER_48 *new48; + const int slotpos = RT_FANOUT_16_HI; + const int idx = RT_BM_IDX(slotpos); + const int bit = RT_BM_BIT(slotpos); + + Assert(n16->base.n.fanout == RT_FANOUT_16_HI); + + /* initialize new node */ + newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_48, RT_CLASS_48, is_leaf); + new48 = (RT_NODE_INNER_48 *) newnode.local; + + /* copy over the entries */ + RT_COPY_COMMON(newnode, node); + for (int i = 0; i < RT_FANOUT_16_HI; i++) + { + new48->base.slot_idxs[n16->base.chunks[i]] = i; + new48->children[i] = n16->children[i]; + } + + /* + * Since we just copied a dense array, we can fill "isset" + * using a single store, provided the length of that array + * is at most the number of bits in a bitmapword. + */ + Assert(RT_FANOUT_16_HI <= sizeof(bitmapword) * BITS_PER_BYTE); + new48->base.isset[0] = (bitmapword) (((uint64) 1 << RT_FANOUT_16_HI) - 1); + + /* add new value */ + + /* mark slot used */ + new48->base.isset[idx] |= ((bitmapword) 1 << bit); + new48->base.slot_idxs[chunk] = slotpos; + + new48->children[slotpos] = child; + new48->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) new48); + + /* free old node and update reference in parent */ + *ref = newnode.alloc; + RT_FREE_NODE(tree, node); + } +} + +static inline void +RT_ADD_CHILD_16(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node.local; + + if (unlikely(RT_NODE_MUST_GROW(n16))) + RT_GROW_NODE_16(tree, ref, node, chunk, child); + else + { + int insertpos = RT_NODE_32_GET_INSERTPOS(&n16->base, chunk); + int count = n16->base.n.count; + + if (insertpos < count) + RT_CHUNK_CHILDREN_ARRAY_SHIFT(n16->base.chunks, n16->children, + count, insertpos); + + n16->base.chunks[insertpos] = chunk; + n16->children[insertpos] = child; + n16->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) n16); + } +} + +static pg_noinline void +RT_GROW_NODE_4(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + const bool is_leaf = false; // xxx + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) (node.local); + RT_NODE_PTR newnode; + RT_NODE_INNER_16 *new16; + int insertpos = RT_NODE_3_GET_INSERTPOS(&n4->base, chunk); + + /* initialize new node */ + newnode = RT_ALLOC_NODE(tree, RT_NODE_KIND_16, RT_CLASS_16_LO, is_leaf); + new16 = (RT_NODE_INNER_16 *) newnode.local; + + /* copy over existing entries and insert new one */ + RT_COPY_COMMON(newnode, node); + RT_CHUNK_CHILDREN_ARRAY_COPY(n4->base.chunks, n4->children, + new16->base.chunks, new16->children, + chunk, child, insertpos, n4->base.n.count); + + new16->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) new16); + + /* free old node and update reference in parent */ + *ref = newnode.alloc; + RT_FREE_NODE(tree, node); +} + +static inline void +RT_ADD_CHILD_4(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) (node.local); + + if (unlikely(RT_NODE_MUST_GROW(n4))) + { + RT_GROW_NODE_4(tree, ref, node, chunk, child); + } + else + { + int insertpos = RT_NODE_3_GET_INSERTPOS(&n4->base, chunk); + int count = n4->base.n.count; + + /* shift chunks and children */ + if 
(insertpos < count) + RT_CHUNK_CHILDREN_ARRAY_SHIFT(n4->base.chunks, n4->children, + count, insertpos); + + n4->base.chunks[insertpos] = chunk; + n4->children[insertpos] = child; + n4->base.n.count++; + RT_VERIFY_NODE((RT_NODE *) n4); + } +} + +/* + * Insert "child" into "node". + * + * "ref" is the parent's child pointer to "node". + * If the node we're inserting into needs to grow, we update the parent's + * child pointer with the pointer to the new larger node. + */ +static void +RT_NODE_INSERT_INNER(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, + uint8 chunk, RT_PTR_ALLOC child) +{ + + switch ((node.local)->kind) + { + case RT_NODE_KIND_4: + RT_ADD_CHILD_4(tree, ref, node, chunk, child); + break; + case RT_NODE_KIND_16: + RT_ADD_CHILD_16(tree, ref, node, chunk, child); + break; + case RT_NODE_KIND_48: + RT_ADD_CHILD_48(tree, ref, node, chunk, child); + break; + case RT_NODE_KIND_256: + RT_ADD_CHILD_256(tree, ref, node, chunk, child); + break; + default: + pg_unreachable(); + } +} + +/* + * Create the radix tree in the given memory context and return it. + */ +RT_SCOPE RT_RADIX_TREE * +#ifdef RT_SHMEM +RT_CREATE(MemoryContext ctx, dsa_area *dsa, int tranche_id) +#else +RT_CREATE(MemoryContext ctx) +#endif +{ + RT_RADIX_TREE *tree; + MemoryContext old_ctx; +#ifdef RT_SHMEM + dsa_pointer dp; +#endif + + old_ctx = MemoryContextSwitchTo(ctx); + + tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE)); + tree->context = ctx; + +#ifdef RT_SHMEM + tree->dsa = dsa; + dp = dsa_allocate0(dsa, sizeof(RT_RADIX_TREE_CONTROL)); + tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, dp); + tree->ctl->handle = dp; + tree->ctl->magic = RT_RADIX_TREE_MAGIC; + LWLockInitialize(&tree->ctl->lock, tranche_id); +#else + tree->ctl = (RT_RADIX_TREE_CONTROL *) palloc0(sizeof(RT_RADIX_TREE_CONTROL)); + + /* Create a slab context for each size class */ + for (int i = 0; i < RT_SIZE_CLASS_COUNT; i++) + { + RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i]; + size_t inner_blocksize = RT_SLAB_BLOCK_SIZE(size_class.inner_size); + + tree->inner_slabs[i] = SlabContextCreate(ctx, + size_class.name, + inner_blocksize, + size_class.inner_size); + } + tree->leaf_slab = SlabContextCreate(ctx, + "radix tree leaves", + RT_SLAB_BLOCK_SIZE(sizeof(RT_VALUE_TYPE)), + sizeof(RT_VALUE_TYPE)); +#endif + + tree->ctl->root = RT_INVALID_PTR_ALLOC; + + MemoryContextSwitchTo(old_ctx); + + return tree; +} + +#ifdef RT_SHMEM +RT_SCOPE RT_RADIX_TREE * +RT_ATTACH(dsa_area *dsa, RT_HANDLE handle) +{ + RT_RADIX_TREE *tree; + dsa_pointer control; + + tree = (RT_RADIX_TREE *) palloc0(sizeof(RT_RADIX_TREE)); + + /* Find the control object in shard memory */ + control = handle; + + tree->dsa = dsa; + tree->ctl = (RT_RADIX_TREE_CONTROL *) dsa_get_address(dsa, control); + Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC); + + return tree; +} + +RT_SCOPE void +RT_DETACH(RT_RADIX_TREE *tree) +{ + Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC); + pfree(tree); +} + +RT_SCOPE RT_HANDLE +RT_GET_HANDLE(RT_RADIX_TREE *tree) +{ + Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC); + return tree->ctl->handle; +} + +/* + * Recursively free all nodes allocated to the DSA area. 
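+ * WIP: the body below is disabled with #if 0 for now; it still uses
+ * the old RT_PTR_SET_LOCAL() calling convention and needs updating to
+ * the RT_NODE_PTR style before it can be re-enabled.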
+ */ +static void +RT_FREE_RECURSE(RT_RADIX_TREE *tree, RT_PTR_ALLOC ptr) +{ +#if 0 + RT_PTR_LOCAL node = RT_PTR_SET_LOCAL(tree, ptr); + + check_stack_depth(); + CHECK_FOR_INTERRUPTS(); + + /* The leaf node doesn't have child pointers */ + /* TODO: track depth */ + if (RT_NODE_IS_LEAF(node)) + { + dsa_free(tree->dsa, ptr); + return; + } + + switch (node->kind) + { + case RT_NODE_KIND_4: + { + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) node; + + for (int i = 0; i < n4->base.n.count; i++) + RT_FREE_RECURSE(tree, n4->children[i]); + + break; + } + case RT_NODE_KIND_16: + { + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node; + + for (int i = 0; i < n16->base.n.count; i++) + RT_FREE_RECURSE(tree, n16->children[i]); + + break; + } + case RT_NODE_KIND_48: + { + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) node; + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!RT_NODE_125_IS_CHUNK_USED(&n48->base, i)) + continue; + + RT_FREE_RECURSE(tree, *RT_NODE_INNER_125_GET_CHILD(n48, i)); + } + + break; + } + case RT_NODE_KIND_256: + { + RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node; + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!RT_NODE_INNER_256_IS_CHUNK_USED(n256, i)) + continue; + + RT_FREE_RECURSE(tree, *RT_NODE_INNER_256_GET_CHILD(n256, i)); + } + + break; + } + } + + /* Free the inner node */ + dsa_free(tree->dsa, ptr); +#endif // 0 +} +#endif + +/* + * Free the given radix tree. + */ +RT_SCOPE void +RT_FREE(RT_RADIX_TREE *tree) +{ +#ifdef RT_SHMEM + Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC); + + /* Free all memory used for radix tree nodes */ + if (RT_PTR_ALLOC_IS_VALID(tree->ctl->root)) + RT_FREE_RECURSE(tree, tree->ctl->root); + + /* + * Vandalize the control block to help catch programming error where + * other backends access the memory formerly occupied by this radix tree. + */ + tree->ctl->magic = 0; + dsa_free(tree->dsa, tree->ctl->handle); +#else + pfree(tree->ctl); + + for (int i = 0; i < RT_SIZE_CLASS_COUNT; i++) + { + MemoryContextDelete(tree->inner_slabs[i]); + } + MemoryContextDelete(tree->leaf_slab); +#endif + + pfree(tree); +} + +static pg_noinline void +RT_EXTEND_DOWN(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint64 key, RT_VALUE_TYPE *value_p, int shift) +{ + RT_NODE_PTR child; + RT_NODE_INNER_4 *n4; + + while (shift > 0) + { + child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4, false); + + /* XXX ref is only valid the first time through, + * but it doesn't matter since that's the only possible + * time an insertion could cause "node" to grow. + */ + RT_NODE_INSERT_INNER(tree, ref, node, + RT_GET_KEY_CHUNK(key, shift), child.alloc); + + node = child; + shift -= RT_NODE_SPAN; + } + + // todo: common function RT_MAKE_LEAF + /* Set child to either an embedded value, or a pointer to a new leaf */ + if (RT_VALUE_IS_EMBEDDABLE) + { + memcpy(&child.alloc, value_p, sizeof(RT_VALUE_TYPE)); + } + else + { + RT_NODE_PTR newleaf; + + newleaf = RT_ALLOC_LEAF(tree); + memcpy(newleaf.local, value_p, sizeof(RT_VALUE_TYPE)); + + child.alloc = newleaf.alloc; + } + + /* Insert child containing our value. 
+
+static pg_noinline void
+RT_EXTEND_DOWN(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint64 key, RT_VALUE_TYPE *value_p, int shift)
+{
+    RT_NODE_PTR child;
+    RT_NODE_INNER_4 *n4;
+
+    while (shift > 0)
+    {
+        child = RT_ALLOC_NODE(tree, RT_NODE_KIND_4, RT_CLASS_4, false);
+
+        /*
+         * XXX ref is only valid the first time through, but it doesn't
+         * matter since that's the only possible time an insertion could
+         * cause "node" to grow.
+         */
+        RT_NODE_INSERT_INNER(tree, ref, node,
+                             RT_GET_KEY_CHUNK(key, shift), child.alloc);
+
+        node = child;
+        shift -= RT_NODE_SPAN;
+    }
+
+    /* todo: common function RT_MAKE_LEAF */
+    /* Set child to either an embedded value, or a pointer to a new leaf */
+    if (RT_VALUE_IS_EMBEDDABLE)
+    {
+        memcpy(&child.alloc, value_p, sizeof(RT_VALUE_TYPE));
+    }
+    else
+    {
+        RT_NODE_PTR newleaf;
+
+        newleaf = RT_ALLOC_LEAF(tree);
+        memcpy(newleaf.local, value_p, sizeof(RT_VALUE_TYPE));
+
+        child.alloc = newleaf.alloc;
+    }
+
+    /* Insert the child containing our value. */
+    Assert((node.local)->kind == RT_NODE_KIND_4);
+    n4 = (RT_NODE_INNER_4 *) node.local;
+    Assert(shift == 0);
+    n4->base.chunks[0] = RT_GET_KEY_CHUNK(key, shift);
+    n4->children[0] = child.alloc;
+    n4->base.n.count = 1;
+}
+
+/*
+ * Workhorse for RT_SET.
+ *
+ * "ref" is the address of the parent's child pointer, which we just
+ * followed; it is needed for growing nodes.
+ */
+static bool
+RT_RECURSIVE_SET(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint64 key, RT_VALUE_TYPE *value_p, int shift)
+{
+    RT_PTR_ALLOC *slot;
+    RT_NODE_PTR child;
+    uint8       chunk = RT_GET_KEY_CHUNK(key, shift);
+
+    slot = RT_NODE_SEARCH_INNER(node.local, chunk);
+
+    if (shift > 0)
+    {
+        if (unlikely(!slot))
+        {
+            RT_EXTEND_DOWN(tree, ref, node, key, value_p, shift);
+            return false;
+        }
+        else
+        {
+            child.alloc = *slot;
+            RT_PTR_SET_LOCAL(tree, &child);
+            return RT_RECURSIVE_SET(tree, slot, child, key, value_p, shift - RT_NODE_SPAN);
+        }
+    }
+    else
+    {
+        if (slot)
+        {
+            /* Found the value, so update it */
+            if (RT_VALUE_IS_EMBEDDABLE)
+            {
+                memcpy(slot, value_p, sizeof(RT_VALUE_TYPE));
+            }
+            else
+            {
+                child.alloc = *slot;
+                RT_PTR_SET_LOCAL(tree, &child);
+
+                memcpy(child.local, value_p, sizeof(RT_VALUE_TYPE));
+            }
+
+            return true;
+        }
+        else
+        {
+            /* Set child to either an embedded value, or a pointer to a new leaf */
+            if (RT_VALUE_IS_EMBEDDABLE)
+            {
+                memcpy(&child.alloc, value_p, sizeof(RT_VALUE_TYPE));
+            }
+            else
+            {
+                RT_NODE_PTR newleaf;
+
+                newleaf = RT_ALLOC_LEAF(tree);
+                memcpy(newleaf.local, value_p, sizeof(RT_VALUE_TYPE));
+
+                child.alloc = newleaf.alloc;
+            }
+
+            /* Insert the child containing our value */
+            RT_NODE_INSERT_INNER(tree, ref, node, chunk, child.alloc);
+            return false;
+        }
+    }
+}
+
+/*
+ * Set key to value. If the entry already exists, update its value to
+ * 'value' and return true; return false if the entry didn't yet exist.
+ */
+RT_SCOPE bool
+RT_SET(RT_RADIX_TREE *tree, uint64 key, RT_VALUE_TYPE *value_p)
+{
+    bool        updated;
+    RT_NODE_PTR rootnode;
+
+#ifdef RT_SHMEM
+    Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
+#endif
+
+    RT_LOCK_EXCLUSIVE(tree);
+
+    /* Empty tree, create the root */
+    if (!RT_PTR_ALLOC_IS_VALID(tree->ctl->root))
+        RT_NEW_ROOT(tree, key);
+
+    /* Extend the tree if necessary */
+    if (key > tree->ctl->max_val)
+        RT_EXTEND_UP(tree, key);
+
+    rootnode.alloc = tree->ctl->root;
+    RT_PTR_SET_LOCAL(tree, &rootnode);
+
+    updated = RT_RECURSIVE_SET(tree, &tree->ctl->root, rootnode,
+                               key, value_p, tree->ctl->start_shift);
+
+    /* Update the statistics */
+    if (!updated)
+        tree->ctl->num_keys++;
+
+    RT_UNLOCK(tree);
+    return updated;
+}
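A usage sketch of the insert/update semantics, continuing the hypothetical "mytree" instantiation from above (key is an arbitrary uint64):

    uint64      val = 42;

    /* key was absent: the value is stored and rt_set returns false */
    if (mytree_set(tree, key, &val))
        elog(ERROR, "did not expect the key to be present");

    /* key now exists: the value is overwritten and rt_set returns true */
    val = 43;
    if (!mytree_set(tree, key, &val))
        elog(ERROR, "expected the key to be present");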
+
+/*
+ * Search for the given key in the radix tree. Return true if the key is
+ * found, otherwise return false. On success, the value is copied into
+ * *value_p, which therefore must not be NULL.
+ */
+RT_SCOPE bool
+RT_SEARCH(RT_RADIX_TREE *tree, uint64 key, RT_VALUE_TYPE *value_p)
+{
+    RT_NODE_PTR node;
+    RT_PTR_ALLOC *child;
+    int         shift;
+
+#ifdef RT_SHMEM
+    Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
+#endif
+    Assert(value_p != NULL);
+
+    RT_LOCK_SHARED(tree);
+
+    if (!RT_PTR_ALLOC_IS_VALID(tree->ctl->root) || key > tree->ctl->max_val)
+    {
+        RT_UNLOCK(tree);
+        return false;
+    }
+
+    node.alloc = tree->ctl->root;
+
+    shift = tree->ctl->start_shift;
+
+    /* Descend the tree until reaching a leaf node */
+    while (shift >= 0)
+    {
+        RT_PTR_SET_LOCAL(tree, &node);
+        child = RT_NODE_SEARCH_INNER(node.local, RT_GET_KEY_CHUNK(key, shift));
+        if (!child)
+        {
+            RT_UNLOCK(tree);
+            return false;
+        }
+
+        node.alloc = *child;
+        shift -= RT_NODE_SPAN;
+    }
+
+    if (RT_VALUE_IS_EMBEDDABLE)
+    {
+        memcpy(value_p, &node.alloc, sizeof(RT_VALUE_TYPE));
+    }
+    else
+    {
+        RT_PTR_SET_LOCAL(tree, &node);
+        memcpy(value_p, node.local, sizeof(RT_VALUE_TYPE));
+    }
+
+    RT_UNLOCK(tree);
+    return true;
+}
+
+#ifdef RT_USE_DELETE
+
+static bool
+RT_RECURSIVE_DELETE(RT_RADIX_TREE *tree, RT_PTR_ALLOC *ref, RT_NODE_PTR node, uint64 key, int shift)
+{
+    uint8       chunk = RT_GET_KEY_CHUNK(key, shift);
+    RT_PTR_ALLOC *slot = RT_NODE_SEARCH_INNER(node.local, chunk);
+    RT_NODE_PTR child;
+
+    if (!slot)
+        return false;
+
+    child.alloc = *slot;
+
+    if (shift == 0)
+    {
+        if (!RT_VALUE_IS_EMBEDDABLE)
+            RT_FREE_LEAF(tree, child);
+
+        RT_NODE_DELETE_INNER(tree, ref, node, chunk, slot);
+        return true;
+    }
+    else
+    {
+        bool        deleted;
+
+        /*
+         * Since we're not at the lowest level, we know this is a pointer
+         * and not an embedded value.
+         */
+        RT_PTR_SET_LOCAL(tree, &child);
+
+        deleted = RT_RECURSIVE_DELETE(tree, slot, child, key, shift - RT_NODE_SPAN);
+
+        /* The child node was freed, so delete its slot now */
+        if (*slot == RT_INVALID_PTR_ALLOC)
+        {
+            Assert(deleted);
+            RT_NODE_DELETE_INNER(tree, ref, node, chunk, slot);
+        }
+
+        return deleted;
+    }
+}
+
+/*
+ * Delete the given key from the radix tree. Return true if the key is found
+ * (and deleted), otherwise do nothing and return false.
+ */
+RT_SCOPE bool
+RT_DELETE(RT_RADIX_TREE *tree, uint64 key)
+{
+    RT_NODE_PTR rootnode;
+    bool        deleted;
+
+#ifdef RT_SHMEM
+    Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
+#endif
+
+    RT_LOCK_EXCLUSIVE(tree);
+
+    if (!RT_PTR_ALLOC_IS_VALID(tree->ctl->root) || key > tree->ctl->max_val)
+    {
+        RT_UNLOCK(tree);
+        return false;
+    }
+
+    rootnode.alloc = tree->ctl->root;
+    RT_PTR_SET_LOCAL(tree, &rootnode);
+
+    deleted = RT_RECURSIVE_DELETE(tree, &tree->ctl->root, rootnode,
+                                  key, tree->ctl->start_shift);
+
+    /* Found the key to delete; update the statistics */
+    if (deleted)
+        tree->ctl->num_keys--;
+
+    RT_UNLOCK(tree);
+    return deleted;
+}
+#endif
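A sketch of the delete semantics with the same hypothetical names; a repeated delete of the same key simply reports it as absent:

    uint64      dummy;

    if (mytree_delete(tree, key))
    {
        /* the key is gone: a search and a second delete both return false */
        Assert(!mytree_search(tree, key, &dummy));
        Assert(!mytree_delete(tree, key));
    }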
+
+/*
+ * Scan the inner node and return the next child node if one exists,
+ * otherwise return NULL.
+ */
+static inline RT_PTR_ALLOC *
+RT_NODE_INNER_ITERATE_NEXT(RT_ITER *iter, int level)
+{
+    uint8       key_chunk = 0;
+    RT_NODE_ITER *node_iter;
+    RT_NODE_PTR node;
+    RT_PTR_ALLOC *slot = NULL;
+
+#ifdef RT_SHMEM
+    Assert(iter->tree->ctl->magic == RT_RADIX_TREE_MAGIC);
+#endif
+
+    node_iter = &(iter->node_iters[level]);
+    node = node_iter->node;
+
+    Assert(node.local != NULL);
+
+    switch ((node.local)->kind)
+    {
+        case RT_NODE_KIND_4:
+            {
+                RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) (node.local);
+
+                if (node_iter->idx >= n4->base.n.count)
+                    return NULL;
+
+                slot = &n4->children[node_iter->idx];
+                key_chunk = n4->base.chunks[node_iter->idx];
+                node_iter->idx++;
+                break;
+            }
+        case RT_NODE_KIND_16:
+            {
+                RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) (node.local);
+
+                if (node_iter->idx >= n16->base.n.count)
+                    return NULL;
+
+                slot = &n16->children[node_iter->idx];
+                key_chunk = n16->base.chunks[node_iter->idx];
+                node_iter->idx++;
+                break;
+            }
+        case RT_NODE_KIND_48:
+            {
+                RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) (node.local);
+                int         chunk;
+
+                for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
+                {
+                    if (RT_NODE_125_IS_CHUNK_USED((RT_NODE_BASE_48 *) n48, chunk))
+                        break;
+                }
+
+                if (chunk >= RT_NODE_MAX_SLOTS)
+                    return NULL;
+
+                slot = RT_NODE_INNER_125_GET_CHILD(n48, chunk);
+
+                key_chunk = chunk;
+                node_iter->idx = chunk + 1;
+                break;
+            }
+        case RT_NODE_KIND_256:
+            {
+                RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) (node.local);
+                int         chunk;
+
+                for (chunk = node_iter->idx; chunk < RT_NODE_MAX_SLOTS; chunk++)
+                {
+                    if (RT_NODE_INNER_256_IS_CHUNK_USED(n256, chunk))
+                        break;
+                }
+
+                if (chunk >= RT_NODE_MAX_SLOTS)
+                    return NULL;
+
+                slot = RT_NODE_INNER_256_GET_CHILD(n256, chunk);
+
+                key_chunk = chunk;
+                node_iter->idx = chunk + 1;
+                break;
+            }
+    }
+
+    /* Update this level's part of the key being reconstructed */
+    iter->key &= ~(((uint64) RT_CHUNK_MASK) << (level * RT_NODE_SPAN));
+    iter->key |= (((uint64) key_chunk) << (level * RT_NODE_SPAN));
+
+    return slot;
+}
+
+/*
+ * While descending the radix tree from the 'from' node to the bottom, set
+ * the next node to iterate for each level.
+ */
+static void
+RT_ITER_SET_NODE_FROM(RT_ITER *iter, RT_NODE_PTR from, int level)
+{
+    RT_NODE_PTR node = from;
+
+    for (;;)
+    {
+        RT_NODE_ITER *node_iter = &(iter->node_iters[level]);
+
+        RT_PTR_SET_LOCAL(iter->tree, &node);
+
+#if 0                           /* def USE_ASSERT_CHECKING, fixme */
+        if (node_iter->node)
+        {
+            /* We must have finished the iteration on the previous node */
+            if (RT_NODE_IS_LEAF(node_iter->node))
+            {
+                uint64      dummy;
+
+                Assert(!RT_NODE_LEAF_ITERATE_NEXT(iter, node_iter, &dummy));
+            }
+            else
+                Assert(!RT_NODE_INNER_ITERATE_NEXT(iter, node_iter, level));
+        }
+#endif
+
+        /* Set the node to the node iterator of this level */
+        node_iter->node = node;
+        node_iter->idx = 0;
+
+        if (level == 0)
+        {
+            /* We will visit the leaf node when RT_ITERATE_NEXT() is called */
+            break;
+        }
+
+        /*
+         * Get the first child node from the node, which corresponds to the
+         * lowest chunk within the node.
+         */
+        node.alloc = *RT_NODE_INNER_ITERATE_NEXT(iter, level);
+
+        /* The first child must be found */
+        Assert(RT_PTR_ALLOC_IS_VALID(node.alloc));
+
+        level--;
+    }
+}
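An iteration usage sketch with the same hypothetical "mytree" names; since the shared lock is held from begin to end of the iteration, the loop body should stay short:

    mytree_iter *iter;
    uint64      key;
    uint64      value;

    iter = mytree_begin_iterate(tree);  /* acquires the shared lock */
    while (mytree_iterate_next(iter, &key, &value))
    {
        /* keys are visited in ascending order */
    }
    mytree_end_iterate(iter);           /* releases the shared lock */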
+
+/*
+ * Create and return the iterator for the given radix tree.
+ *
+ * The radix tree is locked in shared mode during the iteration, so
+ * RT_END_ITERATE needs to be called when finished to release the lock.
+ */
+RT_SCOPE RT_ITER *
+RT_BEGIN_ITERATE(RT_RADIX_TREE *tree)
+{
+    RT_ITER    *iter;
+    RT_NODE_PTR root;
+
+    iter = (RT_ITER *) MemoryContextAllocZero(tree->context,
+                                              sizeof(RT_ITER));
+    iter->tree = tree;
+
+    RT_LOCK_SHARED(tree);
+
+    /* empty tree */
+    if (!iter->tree->ctl->root)
+        return iter;
+
+    root.alloc = iter->tree->ctl->root;
+    RT_PTR_SET_LOCAL(tree, &root);
+
+    iter->top_level = iter->tree->ctl->start_shift / RT_NODE_SPAN;
+
+    /*
+     * Set the next node to iterate for each level, starting from the level
+     * of the root node.
+     */
+    RT_ITER_SET_NODE_FROM(iter, root, iter->top_level);
+
+    return iter;
+}
+
+/*
+ * If there is a next key, return true and set *key_p and *value_p.
+ * Otherwise return false.
+ */
+RT_SCOPE bool
+RT_ITERATE_NEXT(RT_ITER *iter, uint64 *key_p, RT_VALUE_TYPE *value_p)
+{
+    RT_PTR_ALLOC *slot = NULL;
+
+    Assert(value_p != NULL);
+
+    /* Empty tree */
+    if (!iter->tree->ctl->root)
+        return false;
+
+    do
+    {
+        RT_NODE_PTR child;
+
+        /* Get the next chunk of the leaf node */
+        slot = RT_NODE_INNER_ITERATE_NEXT(iter, 0);
+
+        if (slot)
+        {
+            *key_p = iter->key;
+            child.alloc = *slot;
+
+            /* todo: deduplicate with RT_SET? */
+            if (RT_VALUE_IS_EMBEDDABLE)
+            {
+                memcpy(value_p, &child.alloc, sizeof(RT_VALUE_TYPE));
+            }
+            else
+            {
+                RT_PTR_SET_LOCAL(iter->tree, &child);
+                memcpy(value_p, child.local, sizeof(RT_VALUE_TYPE));
+            }
+
+            return true;
+        }
+
+        /*
+         * We've visited all values in the leaf node, so advance the inner
+         * node iterators, starting from level 1, until we find an inner
+         * node that has another child node.
+         */
+        for (int level = 1; level <= iter->top_level; level++)
+        {
+            /* fixme */
+            slot = RT_NODE_INNER_ITERATE_NEXT(iter, level);
+
+            if (slot)
+            {
+                child.alloc = *slot;
+
+                /*
+                 * Found the new child node. Update the next node to iterate
+                 * for each level below this child node's level.
+                 */
+                RT_ITER_SET_NODE_FROM(iter, child, level - 1);
+                break;
+            }
+        }
+    } while (slot != NULL);
+
+    /* We've visited all nodes, so the iteration is finished */
+    return false;
+}
+
+/*
+ * Terminate the iteration and release the lock.
+ *
+ * This function needs to be called after finishing, or when exiting, an
+ * iteration.
+ */
+RT_SCOPE void
+RT_END_ITERATE(RT_ITER *iter)
+{
+#ifdef RT_SHMEM
+    Assert(LWLockHeldByMe(&iter->tree->ctl->lock));
+#endif
+
+    RT_UNLOCK(iter->tree);
+    pfree(iter);
+}
+
+/*
+ * Return statistics on the amount of memory used by the radix tree.
+ */
+RT_SCOPE uint64
+RT_MEMORY_USAGE(RT_RADIX_TREE *tree)
+{
+    Size        total = 0;
+
+    RT_LOCK_SHARED(tree);
+
+#ifdef RT_SHMEM
+    Assert(tree->ctl->magic == RT_RADIX_TREE_MAGIC);
+    total = dsa_get_total_size(tree->dsa);
+#else
+    for (int i = 0; i < RT_SIZE_CLASS_COUNT; i++)
+    {
+        total += MemoryContextMemAllocated(tree->inner_slabs[i], true);
+    }
+    total += MemoryContextMemAllocated(tree->leaf_slab, true);
+#endif
+
+    RT_UNLOCK(tree);
+    return total;
+}
+
+static void pg_attribute_unused()
+RT_DUMP_NODE(RT_PTR_LOCAL node, int level,
+             bool recurse, StringInfo buf)
+{
+#ifdef RT_DEBUG
+    StringInfoData spaces;
+
+    recurse = false;            /* xxx */
+
+    initStringInfo(&spaces);
+    appendStringInfoSpaces(&spaces, (level * 4) + 1);
+    /* todo: clean up; can we use one of our tables for the kind-to-fanout mapping? */
+    appendStringInfo(buf, "%s%s[%s] kind %d, fanout %d, count %u:\n",
+                     spaces.data,
+                     level == 0 ? "" : "-> ",
+                     RT_NODE_IS_LEAF(node) ? "LEAF" : "INNR",
+                     (node->kind == RT_NODE_KIND_4) ? 3 :
+                     (node->kind == RT_NODE_KIND_16) ? 32 :
+                     (node->kind == RT_NODE_KIND_48) ?
125 : 256, + node->fanout == 0 ? 256 : node->fanout, + + (node->kind == RT_NODE_KIND_256) ? + (node->count == 0 ? 256 : node->count) : + node->count); + + switch (node->kind) + { + case RT_NODE_KIND_4: + { + for (int i = 0; i < node->count; i++) + { +#if 0 + if (RT_NODE_IS_LEAF(node)) + { + RT_NODE_LEAF_4 *n4 = (RT_NODE_LEAF_4 *) node; + + appendStringInfo(buf, "%schunk[%d] 0x%X\n", + spaces.data, i, n4->base.chunks[i]); + } + else +#endif + { + RT_NODE_INNER_4 *n4 = (RT_NODE_INNER_4 *) node; + + appendStringInfo(buf, "%schunk[%d] 0x%X", + spaces.data, i, n4->base.chunks[i]); +#if 0 + if (recurse) + { + appendStringInfo(buf, "\n"); + RT_DUMP_NODE(n4->children[i], level + 1, + recurse, buf); + } + else + appendStringInfo(buf, " (skipped)\n"); +#endif + +#if 0 + /* quick hack for inspecting values in a one-level tree */ + if (n4->children[i] != NULL) + appendStringInfo(buf, " %lu\n", *((RT_VALUE_TYPE *) n4->children[i])); + else + appendStringInfo(buf, " (NULL)\n"); +#endif + } + } + break; + } + case RT_NODE_KIND_16: + { + for (int i = 0; i < node->count; i++) + { +#if 0 + if (RT_NODE_IS_LEAF(node)) + { + RT_NODE_LEAF_16 *n16 = (RT_NODE_LEAF_16 *) node; + + appendStringInfo(buf, "%schunk[%d] 0x%X\n", + spaces.data, i, n16->base.chunks[i]); + } + else +#endif + { + RT_NODE_INNER_16 *n16 = (RT_NODE_INNER_16 *) node; + + appendStringInfo(buf, "%schunk[%d] 0x%X", + spaces.data, i, n16->base.chunks[i]); + +#if 0 + if (recurse) + { + appendStringInfo(buf, "\n"); + RT_DUMP_NODE(n16->children[i], level + 1, + recurse, buf); + } + else +#endif + if (n16->children[i] != RT_INVALID_PTR_ALLOC) + appendStringInfo(buf, " %lu\n", *((RT_VALUE_TYPE *) n16->children[i])); + else + appendStringInfo(buf, " (NULL)\n"); + + } + } + break; + } + case RT_NODE_KIND_48: + { + RT_NODE_BASE_48 *b125 = (RT_NODE_BASE_48 *) node; + char *sep = ""; + + appendStringInfo(buf, "%sslot_idxs: ", spaces.data); + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!RT_NODE_125_IS_CHUNK_USED(b125, i)) + continue; + + appendStringInfo(buf, "%s[%d]=%d ", + sep, i, b125->slot_idxs[i]); + sep = ","; + } + + appendStringInfo(buf, "\n%sisset-bitmap: ", spaces.data); + for (int i = 0; i < (RT_SLOT_IDX_LIMIT / BITS_PER_BYTE); i++) + appendStringInfo(buf, "%X ", ((uint8 *) b125->isset)[i]); + appendStringInfo(buf, "\n"); + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (!RT_NODE_125_IS_CHUNK_USED(b125, i)) + continue; +#if 0 + if (RT_NODE_IS_LEAF(node)) + appendStringInfo(buf, "%schunk 0x%X\n", + spaces.data, i); + else +#endif + { + appendStringInfo(buf, "%schunk 0x%X", + spaces.data, i); + +#if 0 + if (recurse) + { + RT_NODE_INNER_48 *n48 = (RT_NODE_INNER_48 *) b125; + + appendStringInfo(buf, "\n"); + RT_DUMP_NODE(*(RT_NODE_INNER_125_GET_CHILD(n48, i)), + level + 1, recurse, buf); + } + else +#endif + appendStringInfo(buf, " (skipped)\n"); + } + } + break; + } +#if 0 + case RT_NODE_KIND_256: + { + if (RT_NODE_IS_LEAF(node)) + { + RT_NODE_LEAF_256 *n256 = (RT_NODE_LEAF_256 *) node; + + appendStringInfo(buf, "%sisset-bitmap: ", spaces.data); + for (int i = 0; i < (RT_SLOT_IDX_LIMIT / BITS_PER_BYTE); i++) + appendStringInfo(buf, "%X ", ((uint8 *) n256->isset)[i]); + appendStringInfo(buf, "\n"); + } + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + if (RT_NODE_IS_LEAF(node)) + { + RT_NODE_LEAF_256 *n256 = (RT_NODE_LEAF_256 *) node; + + if (!RT_NODE_LEAF_256_IS_CHUNK_USED(n256, i)) + continue; + + appendStringInfo(buf, "%schunk 0x%X\n", + spaces.data, i); + } + else + { + RT_NODE_INNER_256 *n256 = (RT_NODE_INNER_256 *) node; + 
+ if (!RT_NODE_INNER_256_IS_CHUNK_USED(n256, i)) + continue; + + appendStringInfo(buf, "%schunk 0x%X", + spaces.data, i); + + if (recurse) + { + appendStringInfo(buf, "\n"); + RT_DUMP_NODE(RT_NODE_INNER_256_GET_CHILD(n256, i), + level + 1, recurse, buf); + } + else + appendStringInfo(buf, " (skipped)\n"); + } + } + break; + } +#endif + } +#endif +} + +/* + * Verify the radix tree node. + */ +// XXX somewhat whacked around to allow dumping single node types for debugging +static void +RT_VERIFY_NODE(RT_PTR_LOCAL node) +{ +#ifdef USE_ASSERT_CHECKING + StringInfoData buf; + + initStringInfo(&buf); + + switch (node->kind) + { + case RT_NODE_KIND_4: + { + RT_NODE_BASE_4 *n4 = (RT_NODE_BASE_4 *) node; + + if (false) + { + RT_DUMP_NODE(node, 0, false, &buf); + fprintf(stderr, "%s",buf.data); + } + + for (int i = 1; i < n4->n.count; i++) + Assert(n4->chunks[i - 1] < n4->chunks[i]); + + break; + } + case RT_NODE_KIND_16: + { + RT_NODE_BASE_16 *n16 = (RT_NODE_BASE_16 *) node; + + if (false) + { + RT_DUMP_NODE(node, 0, false, &buf); + fprintf(stderr, "%s",buf.data); + } + + for (int i = 1; i < n16->n.count; i++) + Assert(n16->chunks[i - 1] < n16->chunks[i]); + + break; + } + case RT_NODE_KIND_48: + { + RT_NODE_BASE_48 *n48 = (RT_NODE_BASE_48 *) node; + int cnt = 0; + + if (false) + { + RT_DUMP_NODE(node, 0, false, &buf); + fprintf(stderr, "%s",buf.data); + } + + for (int i = 0; i < RT_NODE_MAX_SLOTS; i++) + { + uint8 slot = n48->slot_idxs[i]; + int idx = RT_BM_IDX(slot); + int bitnum = RT_BM_BIT(slot); + + if (!RT_NODE_125_IS_CHUNK_USED(n48, i)) + continue; + + /* Check if the corresponding slot is used */ + Assert(slot < node->fanout); + Assert((n48->isset[idx] & ((bitmapword) 1 << bitnum)) != 0); + + cnt++; + } + + Assert(n48->n.count == cnt); + + break; + } + case RT_NODE_KIND_256: + { + if (RT_NODE_IS_LEAF(node)) + { + RT_NODE_LEAF_256 *n256 = (RT_NODE_LEAF_256 *) node; + int cnt = 0; + + for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++) + cnt += bmw_popcount(n256->isset[i]); + + /* Check if the number of used chunk matches, accounting for overflow */ + if (cnt == 256) + Assert(n256->base.n.count == 0); + else + Assert(n256->base.n.count == cnt); + + break; + } + } + } +#endif +} + + +/***************** DEBUG FUNCTIONS *****************/ + +#define RT_UINT64_FORMAT_HEX "%" INT64_MODIFIER "X" + +RT_SCOPE void pg_attribute_unused() +RT_STATS(RT_RADIX_TREE *tree) +{ +#ifdef RT_DEBUG + RT_LOCK_SHARED(tree); + + fprintf(stderr, "max_val = " UINT64_FORMAT "\n", tree->ctl->max_val); + fprintf(stderr, "num_keys = " UINT64_FORMAT "\n", tree->ctl->num_keys); + +#ifdef RT_SHMEM + fprintf(stderr, "handle = " UINT64_FORMAT "\n", tree->ctl->handle); +#endif + + if (RT_PTR_ALLOC_IS_VALID(tree->ctl->root)) + { + fprintf(stderr, "height = %d", tree->ctl->start_shift / RT_NODE_SPAN); + + for (int i = 0; i < RT_SIZE_CLASS_COUNT; i++) + { + RT_SIZE_CLASS_ELEM size_class = RT_SIZE_CLASS_INFO[i]; + fprintf(stderr, ", n%d = %u", size_class.fanout, tree->ctl->cnt[i]); + } + + fprintf(stderr, "\n"); + } + + RT_UNLOCK(tree); +#endif +} + + +#if 0 +RT_SCOPE void +RT_DUMP_SEARCH(RT_RADIX_TREE *tree, uint64 key) +{ + RT_PTR_ALLOC allocnode; + RT_PTR_LOCAL node; + StringInfoData buf; + int shift; + int level = 0; + + RT_STATS(tree); + + RT_LOCK_SHARED(tree); + + if (!RT_PTR_ALLOC_IS_VALID(tree->ctl->root)) + { + RT_UNLOCK(tree); + fprintf(stderr, "empty tree\n"); + return; + } + + if (key > tree->ctl->max_val) + { + RT_UNLOCK(tree); + fprintf(stderr, "key " UINT64_FORMAT "(0x" RT_UINT64_FORMAT_HEX ") is larger than max 
val\n", + key, key); + return; + } + + initStringInfo(&buf); + allocnode = tree->ctl->root; + node = RT_PTR_SET_LOCAL(tree, allocnode); + shift = node->shift; + while (shift >= 0) + { + RT_PTR_ALLOC child; + + RT_DUMP_NODE(tree, allocnode, level, false, &buf); + + if (RT_NODE_IS_LEAF(node)) + { + RT_VALUE_TYPE dummy; + + /* We reached at a leaf node, find the corresponding slot */ + RT_NODE_SEARCH_LEAF(node, key, &dummy); + + break; + } + + if (!RT_NODE_SEARCH_INNER(node.local, RT_GET_KEY_CHUNK(key, shift))) + break; + + allocnode = child; + node = RT_PTR_SET_LOCAL(tree, allocnode); + shift -= RT_NODE_SPAN; + level++; + } + RT_UNLOCK(tree); + + fprintf(stderr, "%s", buf.data); +} + +// this might be better as "iterate over nodes", plus a callback to RT_DUMP_NODE, +// which should really only concern itself with single nodes +RT_SCOPE void +RT_DUMP(RT_RADIX_TREE *tree) +{ + StringInfoData buf; + + RT_STATS(tree); + + RT_LOCK_SHARED(tree); + + if (!RT_PTR_ALLOC_IS_VALID(tree->ctl->root)) + { + RT_UNLOCK(tree); + fprintf(stderr, "empty tree\n"); + return; + } + + initStringInfo(&buf); + + RT_DUMP_NODE(tree->ctl->root, 0, true, &buf); + RT_UNLOCK(tree); + + fprintf(stderr, "%s",buf.data); +} + +#endif /* 0 */ + + +#endif /* RT_DEFINE */ + + +/* undefine external parameters, so next radix tree can be defined */ +#undef RT_PREFIX +#undef RT_SCOPE +#undef RT_DECLARE +#undef RT_DEFINE +#undef RT_VALUE_TYPE + +/* locally declared macros */ +#undef RT_MAKE_PREFIX +#undef RT_MAKE_NAME +#undef RT_MAKE_NAME_ +#undef RT_NODE_SPAN +#undef RT_NODE_MAX_SLOTS +#undef RT_CHUNK_MASK +#undef RT_MAX_SHIFT +#undef RT_MAX_LEVEL +#undef RT_GET_KEY_CHUNK +#undef RT_BM_IDX +#undef RT_BM_BIT +#undef RT_LOCK_EXCLUSIVE +#undef RT_LOCK_SHARED +#undef RT_UNLOCK +#undef RT_NODE_IS_LEAF +#undef RT_NODE_MUST_GROW +#undef RT_NODE_KIND_COUNT +#undef RT_SIZE_CLASS_COUNT +#undef RT_SLOT_IDX_LIMIT +#undef RT_INVALID_SLOT_IDX +#undef RT_SLAB_BLOCK_SIZE +#undef RT_RADIX_TREE_MAGIC +#undef RT_UINT64_FORMAT_HEX + +/* type declarations */ +#undef RT_RADIX_TREE +#undef RT_RADIX_TREE_CONTROL +#undef RT_NODE_PTR +#undef RT_PTR_LOCAL +#undef RT_PTR_ALLOC +#undef RT_INVALID_PTR_ALLOC +#undef RT_HANDLE +#undef RT_ITER +#undef RT_NODE +#undef RT_NODE_ITER +#undef RT_NODE_KIND_4 +#undef RT_NODE_KIND_16 +#undef RT_NODE_KIND_48 +#undef RT_NODE_KIND_256 +#undef RT_NODE_BASE_4 +#undef RT_NODE_BASE_16 +#undef RT_NODE_BASE_48 +#undef RT_NODE_BASE_256 +#undef RT_NODE_INNER_4 +#undef RT_NODE_INNER_16 +#undef RT_NODE_INNER_48 +#undef RT_NODE_INNER_256 +#undef RT_NODE_LEAF_4 +#undef RT_NODE_LEAF_16 +#undef RT_NODE_LEAF_48 +#undef RT_NODE_LEAF_256 +#undef RT_SIZE_CLASS +#undef RT_SIZE_CLASS_ELEM +#undef RT_SIZE_CLASS_INFO +#undef RT_CLASS_4 +#undef RT_CLASS_16_LO +#undef RT_CLASS_16_HI +#undef RT_CLASS_48 +#undef RT_CLASS_256 +#undef RT_FANOUT_4 +#undef RT_FANOUT_16_LO +#undef RT_FANOUT_16_HI +#undef RT_FANOUT_48 +#undef RT_FANOUT_256 + +/* function declarations */ +#undef RT_CREATE +#undef RT_FREE +#undef RT_ATTACH +#undef RT_DETACH +#undef RT_GET_HANDLE +#undef RT_SEARCH +#undef RT_SET +#undef RT_BEGIN_ITERATE +#undef RT_ITERATE_NEXT +#undef RT_END_ITERATE +#undef RT_USE_DELETE +#undef RT_DELETE +#undef RT_MEMORY_USAGE +#undef RT_DUMP +#undef RT_DUMP_NODE +#undef RT_DUMP_SEARCH +#undef RT_STATS + +/* internal helper functions */ +#undef RT_NEW_ROOT +#undef RT_RECURSIVE_SET +#undef RT_RECURSIVE_DELETE +#undef RT_ALLOC_NODE +#undef RT_ALLOC_LEAF +#undef RT_FREE_NODE +#undef RT_FREE_LEAF +#undef RT_FREE_RECURSE +#undef RT_EXTEND_UP +#undef 
RT_EXTEND_DOWN +#undef RT_COPY_COMMON +#undef RT_PTR_SET_LOCAL +#undef RT_PTR_ALLOC_IS_VALID +#undef RT_NODE_3_SEARCH_EQ +#undef RT_NODE_32_SEARCH_EQ +#undef RT_NODE_3_GET_INSERTPOS +#undef RT_NODE_32_GET_INSERTPOS +#undef RT_CHUNK_CHILDREN_ARRAY_SHIFT +#undef RT_CHUNK_VALUES_ARRAY_SHIFT +#undef RT_CHUNK_CHILDREN_ARRAY_DELETE +#undef RT_CHUNK_VALUES_ARRAY_DELETE +#undef RT_CHUNK_CHILDREN_ARRAY_COPY +#undef RT_CHUNK_VALUES_ARRAY_COPY +#undef RT_NODE_125_IS_CHUNK_USED +#undef RT_NODE_INNER_125_GET_CHILD +#undef RT_NODE_LEAF_125_GET_VALUE +#undef RT_NODE_INNER_256_IS_CHUNK_USED +#undef RT_NODE_LEAF_256_IS_CHUNK_USED +#undef RT_NODE_INNER_256_GET_CHILD +#undef RT_NODE_LEAF_256_GET_VALUE +#undef RT_NODE_INNER_256_SET +#undef RT_NODE_LEAF_256_SET +#undef RT_NODE_INNER_256_DELETE +#undef RT_NODE_LEAF_256_DELETE +#undef RT_KEY_GET_SHIFT +#undef RT_SHIFT_GET_MAX_VAL +#undef RT_NODE_SEARCH_INNER +#undef RT_ADD_CHILD_4 +#undef RT_ADD_CHILD_16 +#undef RT_ADD_CHILD_48 +#undef RT_ADD_CHILD_256 +#undef RT_GROW_NODE_4 +#undef RT_GROW_NODE_16 +#undef RT_GROW_NODE_48 +#undef RT_GROW_NODE_256 +#undef RT_REMOVE_CHILD_4 +#undef RT_REMOVE_CHILD_16 +#undef RT_REMOVE_CHILD_48 +#undef RT_REMOVE_CHILD_256 +#undef RT_NODE_SEARCH_LEAF +#undef RT_NODE_UPDATE_INNER +#undef RT_NODE_DELETE_INNER +#undef RT_NODE_DELETE_LEAF +#undef RT_NODE_INSERT_INNER +#undef RT_NODE_INSERT_LEAF +#undef RT_NODE_INNER_ITERATE_NEXT +#undef RT_NODE_LEAF_ITERATE_NEXT +#undef RT_RT_ITER_SET_NODE_FROM +#undef RT_VERIFY_NODE + +#undef RT_DEBUG diff --git a/src/include/utils/dsa.h b/src/include/utils/dsa.h index 3ce4ee300a..2af215484f 100644 --- a/src/include/utils/dsa.h +++ b/src/include/utils/dsa.h @@ -121,6 +121,7 @@ extern dsa_handle dsa_get_handle(dsa_area *area); extern dsa_pointer dsa_allocate_extended(dsa_area *area, size_t size, int flags); extern void dsa_free(dsa_area *area, dsa_pointer dp); extern void *dsa_get_address(dsa_area *area, dsa_pointer dp); +extern size_t dsa_get_total_size(dsa_area *area); extern void dsa_trim(dsa_area *area); extern void dsa_dump(dsa_area *area); diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 6331c976dc..05f16e880b 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -27,6 +27,7 @@ SUBDIRS = \ test_parser \ test_pg_dump \ test_predtest \ + test_radixtree \ test_rbtree \ test_regex \ test_rls_hooks \ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 17d369e378..995d8c0cc6 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -24,6 +24,7 @@ subdir('test_oat_hooks') subdir('test_parser') subdir('test_pg_dump') subdir('test_predtest') +subdir('test_radixtree') subdir('test_rbtree') subdir('test_regex') subdir('test_rls_hooks') diff --git a/src/test/modules/test_radixtree/.gitignore b/src/test/modules/test_radixtree/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/src/test/modules/test_radixtree/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_radixtree/Makefile b/src/test/modules/test_radixtree/Makefile new file mode 100644 index 0000000000..da06b93da3 --- /dev/null +++ b/src/test/modules/test_radixtree/Makefile @@ -0,0 +1,23 @@ +# src/test/modules/test_radixtree/Makefile + +MODULE_big = test_radixtree +OBJS = \ + $(WIN32RES) \ + test_radixtree.o +PGFILEDESC = "test_radixtree - test code for src/backend/lib/radixtree.c" + +EXTENSION = test_radixtree +DATA = test_radixtree--1.0.sql + +REGRESS = 
test_radixtree
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_radixtree
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_radixtree/README b/src/test/modules/test_radixtree/README
new file mode 100644
index 0000000000..a8b271869a
--- /dev/null
+++ b/src/test/modules/test_radixtree/README
@@ -0,0 +1,7 @@
+test_radixtree contains unit tests for the radix tree implementation in
+src/include/lib/radixtree.h.
+
+The tests verify the correctness of the implementation, but they can also be
+used as a micro-benchmark. If you set the 'rt_test_stats' flag in
+test_radixtree.c, the tests will print extra information about execution time
+and memory usage.
diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out
new file mode 100644
index 0000000000..617703d0a9
--- /dev/null
+++ b/src/test/modules/test_radixtree/expected/test_radixtree.out
@@ -0,0 +1,48 @@
+CREATE EXTENSION test_radixtree;
+--
+-- All the logic is in the test_radixtree() function. It will throw
+-- an error if something fails.
+--
+SELECT test_radixtree();
+NOTICE:  testing node   3 with height 0 and  ascending keys
+NOTICE:  testing node   3 with height 0 and descending keys
+NOTICE:  testing node   3 with height 1 and  ascending keys
+NOTICE:  testing node   3 with height 1 and descending keys
+NOTICE:  testing node  15 with height 0 and  ascending keys
+NOTICE:  testing node  15 with height 0 and descending keys
+NOTICE:  testing node  15 with height 1 and  ascending keys
+NOTICE:  testing node  15 with height 1 and descending keys
+NOTICE:  testing node  32 with height 0 and  ascending keys
+NOTICE:  testing node  32 with height 0 and descending keys
+NOTICE:  testing node  32 with height 1 and  ascending keys
+NOTICE:  testing node  32 with height 1 and descending keys
+NOTICE:  testing node 125 with height 0 and  ascending keys
+NOTICE:  testing node 125 with height 0 and descending keys
+NOTICE:  testing node 125 with height 1 and  ascending keys
+NOTICE:  testing node 125 with height 1 and descending keys
+NOTICE:  testing node 256 with height 0 and  ascending keys
+NOTICE:  testing node 256 with height 0 and descending keys
+NOTICE:  testing node 256 with height 1 and  ascending keys
+NOTICE:  testing node 256 with height 1 and descending keys
+NOTICE:  testing radix tree node types with shift "0"
+NOTICE:  testing radix tree node types with shift "8"
+NOTICE:  testing radix tree node types with shift "16"
+NOTICE:  testing radix tree node types with shift "24"
+NOTICE:  testing radix tree node types with shift "32"
+NOTICE:  testing radix tree node types with shift "40"
+NOTICE:  testing radix tree node types with shift "48"
+NOTICE:  testing radix tree node types with shift "56"
+NOTICE:  testing radix tree with pattern "all ones"
+NOTICE:  testing radix tree with pattern "alternating bits"
+NOTICE:  testing radix tree with pattern "clusters of ten"
+NOTICE:  testing radix tree with pattern "clusters of hundred"
+NOTICE:  testing radix tree with pattern "one-every-64k"
+NOTICE:  testing radix tree with pattern "sparse"
+NOTICE:  testing radix tree with pattern "single values, distance > 2^32"
+NOTICE:  testing radix tree with pattern "clusters, distance > 2^32"
+NOTICE:  testing radix tree with pattern "clusters, distance > 2^60"
+ test_radixtree 
+----------------
+ 
+(1 row)
+
diff --git 
a/src/test/modules/test_radixtree/meson.build b/src/test/modules/test_radixtree/meson.build new file mode 100644 index 0000000000..6add06bbdb --- /dev/null +++ b/src/test/modules/test_radixtree/meson.build @@ -0,0 +1,35 @@ +# FIXME: prevent install during main install, but not during test :/ + +test_radixtree_sources = files( + 'test_radixtree.c', +) + +if host_system == 'windows' + test_radixtree_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_radixtree', + '--FILEDESC', 'test_radixtree - test code for src/include//lib/radixtree.h',]) +endif + +test_radixtree = shared_module('test_radixtree', + test_radixtree_sources, + link_with: pgport_srv, + kwargs: pg_mod_args, +) +testprep_targets += test_radixtree + +install_data( + 'test_radixtree.control', + 'test_radixtree--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'test_radixtree', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_radixtree', + ], + }, +} diff --git a/src/test/modules/test_radixtree/sql/test_radixtree.sql b/src/test/modules/test_radixtree/sql/test_radixtree.sql new file mode 100644 index 0000000000..41ece5e9f5 --- /dev/null +++ b/src/test/modules/test_radixtree/sql/test_radixtree.sql @@ -0,0 +1,7 @@ +CREATE EXTENSION test_radixtree; + +-- +-- All the logic is in the test_radixtree() function. It will throw +-- an error if something fails. +-- +SELECT test_radixtree(); diff --git a/src/test/modules/test_radixtree/test_radixtree--1.0.sql b/src/test/modules/test_radixtree/test_radixtree--1.0.sql new file mode 100644 index 0000000000..074a5a7ea7 --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_radixtree/test_radixtree--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_radixtree" to load this file. \quit + +CREATE FUNCTION test_radixtree() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c new file mode 100644 index 0000000000..451206f2da --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -0,0 +1,776 @@ +/*-------------------------------------------------------------------------- + * + * test_radixtree.c + * Test radixtree set data structure. + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_radixtree/test_radixtree.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "common/pg_prng.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "nodes/bitmapset.h" +#include "storage/block.h" +#include "storage/itemptr.h" +#include "storage/lwlock.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "X" + +/* + * The tests pass with uint32, but build with warnings because the string + * format expects uint64. + */ +typedef uint64 TestValueType; + +/* + * If you enable this, the "pattern" tests will print information about + * how long populating, probing, and iterating the test set takes, and + * how much memory the test set consumed. That can be used as + * micro-benchmark of various operations and input patterns (you might + * want to increase the number of values used in each of the test, if + * you do that, to reduce noise). 
+ *
+ * The information is printed to the server's stderr, mostly because
+ * that's where MemoryContextStats() output goes.
+ */
+static const bool rt_test_stats = false;
+
+/*
+ * XXX: should we expose and use RT_SIZE_CLASS and RT_SIZE_CLASS_INFO?
+ */
+static int rt_node_class_fanouts[] = {
+    3,                          /* RT_CLASS_3 */
+    15,                         /* RT_CLASS_32_MIN */
+    32,                         /* RT_CLASS_32_MAX */
+    125,                        /* RT_CLASS_125 */
+    256                         /* RT_CLASS_256 */
+};
+
+/*
+ * A struct to define a pattern of integers, for use with the test_pattern()
+ * function.
+ */
+typedef struct
+{
+    char       *test_name;      /* short name of the test, for humans */
+    char       *pattern_str;    /* a bit pattern */
+    uint64      spacing;        /* pattern repeats at this interval */
+    uint64      num_values;     /* number of integers to set in total */
+} test_spec;
+
+/* Test patterns borrowed from test_integerset.c */
+static const test_spec test_specs[] = {
+    {
+        "all ones", "1111111111",
+        10, 1000000
+    },
+    {
+        "alternating bits", "0101010101",
+        10, 1000000
+    },
+    {
+        "clusters of ten", "1111111111",
+        10000, 1000000
+    },
+    {
+        "clusters of hundred",
+        "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111",
+        10000, 1000000
+    },
+    {
+        "one-every-64k", "1",
+        65536, 1000000
+    },
+    {
+        "sparse", "100000000000000000000000000000001",
+        10000000, 1000000
+    },
+    {
+        "single values, distance > 2^32", "1",
+        UINT64CONST(10000000000), 100000
+    },
+    {
+        "clusters, distance > 2^32", "10101010",
+        UINT64CONST(10000000000), 1000000
+    },
+    {
+        "clusters, distance > 2^60", "10101010",
+        UINT64CONST(2000000000000000000),
+        23                      /* can't be much higher than this, or we
+                                 * overflow uint64 */
+    }
+};
+
+/* define the radix tree implementation to test */
+#define RT_PREFIX rt
+#define RT_SCOPE static pg_noinline
+#define RT_DECLARE
+#define RT_DEFINE
+#define RT_USE_DELETE
+#define RT_VALUE_TYPE TestValueType
+/* #define RT_SHMEM */
+#define RT_DEBUG
+#include "lib/radixtree.h"
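As a worked example of how these specs expand into keys (illustrative; the arithmetic follows the generation loop in test_pattern() below):

    /*
     * "clusters of ten": pattern_str "1111111111", spacing 10000.
     * Outer iteration i sets keys
     *     i * 10000 + 0, i * 10000 + 1, ..., i * 10000 + 9
     * i.e. 0..9, 10000..10009, 20000..20009, and so on, until
     * num_values (1,000,000) keys have been set in total.
     */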
+
+/*
+ * Return the number of keys in the radix tree.
+ */
+static uint64
+rt_num_entries(rt_radix_tree *tree)
+{
+    return tree->ctl->num_keys;
+}
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(test_radixtree);
+
+static void
+test_empty(void)
+{
+    rt_radix_tree *radixtree;
+    rt_iter    *iter;
+    TestValueType dummy;
+    uint64      key;
+    TestValueType val;
+
+#ifdef RT_SHMEM
+    int         tranche_id = LWLockNewTrancheId();
+    dsa_area   *dsa;
+
+    LWLockRegisterTranche(tranche_id, "test_radix_tree");
+    dsa = dsa_create(tranche_id);
+
+    radixtree = rt_create(CurrentMemoryContext, dsa, tranche_id);
+#else
+    radixtree = rt_create(CurrentMemoryContext);
+#endif
+
+    if (rt_search(radixtree, 0, &dummy))
+        elog(ERROR, "rt_search on empty tree returned true");
+
+    if (rt_search(radixtree, 1, &dummy))
+        elog(ERROR, "rt_search on empty tree returned true");
+
+    if (rt_search(radixtree, PG_UINT64_MAX, &dummy))
+        elog(ERROR, "rt_search on empty tree returned true");
+
+    if (rt_delete(radixtree, 0))
+        elog(ERROR, "rt_delete on empty tree returned true");
+
+    if (rt_num_entries(radixtree) != 0)
+        elog(ERROR, "rt_num_entries on empty tree returned non-zero");
+
+    iter = rt_begin_iterate(radixtree);
+
+    if (rt_iterate_next(iter, &key, &val))
+        elog(ERROR, "rt_iterate_next on empty tree returned true");
+
+    rt_end_iterate(iter);
+
+    rt_free(radixtree);
+
+#ifdef RT_SHMEM
+    dsa_detach(dsa);
+#endif
+}
+
+static void
+test_basic(int children, int height, bool reverse)
+{
+    rt_radix_tree *radixtree;
+    rt_iter    *iter;
+    uint64     *keys;
+    int         shift = height * 8;
+
+#ifdef RT_SHMEM
+    int         tranche_id = LWLockNewTrancheId();
+    dsa_area   *dsa;
+
+    LWLockRegisterTranche(tranche_id, "test_radix_tree");
+    dsa = dsa_create(tranche_id);
+#endif
+
+    elog(NOTICE, "testing node %3d with height %d and %s keys",
+         children, height, reverse ? "descending" : " ascending");
+
+#ifdef RT_SHMEM
+    radixtree = rt_create(CurrentMemoryContext, dsa, tranche_id);
+#else
+    radixtree = rt_create(CurrentMemoryContext);
+#endif
+
+    keys = palloc(sizeof(uint64) * children);
+    for (int i = 0; i < children; i++)
+        keys[i] = (uint64) i << shift;
+
+    /* insert keys */
+    if (reverse)
+    {
+        for (int i = children - 1; i >= 0; i--)
+        {
+            if (rt_set(radixtree, keys[i], (TestValueType *) &keys[i]))
+                elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " is found", keys[i]);
+        }
+    }
+    else
+    {
+        for (int i = 0; i < children; i++)
+        {
+            if (rt_set(radixtree, keys[i], (TestValueType *) &keys[i]))
+                elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " is found", keys[i]);
+        }
+    }
+
+    rt_stats(radixtree);
+
+    /* look up keys */
+    for (int i = 0; i < children; i++)
+    {
+        TestValueType value;
+
+        if (!rt_search(radixtree, keys[i], &value))
+            elog(ERROR, "could not find key 0x" UINT64_HEX_FORMAT, keys[i]);
+        if (value != (TestValueType) keys[i])
+            elog(ERROR, "rt_search returned 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT,
+                 value, (TestValueType) keys[i]);
+    }
+
+    /* update keys */
+    for (int i = 0; i < children; i++)
+    {
+        TestValueType update = keys[i] + 1;
+
+        if (!rt_set(radixtree, keys[i], (TestValueType *) &update))
+            elog(ERROR, "could not update key 0x" UINT64_HEX_FORMAT, keys[i]);
+    }
+
+    /* repeatedly delete and re-insert keys */
+    for (int i = 0; i < children; i++)
+    {
+        if (!rt_delete(radixtree, keys[i]))
+            elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, keys[i]);
+        if (rt_set(radixtree, keys[i], (TestValueType *) &keys[i]))
+            elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " is found", keys[i]);
+    }
+
+    /* look up keys after deleting and re-inserting */
+    for (int i = 0; i < children; i++)
+    {
+        TestValueType
value; + + if (!rt_search(radixtree, keys[i], &value)) + elog(ERROR, "could not find key 0x" UINT64_HEX_FORMAT, keys[i]); + if (value != (TestValueType) keys[i]) + elog(ERROR, "rt_search returned 0x" UINT64_HEX_FORMAT ", expected " UINT64_HEX_FORMAT, + value, (TestValueType) keys[i]); + } + + /* iterate over the tree */ + iter = rt_begin_iterate(radixtree); + + for (int i = 0; i < children; i++) + { + uint64 expected = keys[i]; + uint64 iterkey; + TestValueType iterval; + + if (!rt_iterate_next(iter, &iterkey, &iterval)) + elog(ERROR, "iteration terminated prematurely"); + + if (iterkey != expected) + elog(ERROR, + "iterate returned wrong key; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", + iterkey, expected, i); + if (iterval != (TestValueType) expected) + elog(ERROR, + "iterate returned wrong value; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", iterval, expected, i); + } + + rt_end_iterate(iter); + + + /* delete again and check that the tree is empty */ + for (int i = 0; i < children; i++) + { + if (!rt_delete(radixtree, keys[i])) + elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, keys[i]); + } + for (int i = 0; i < children; i++) + { + TestValueType value; + + if (rt_search(radixtree, keys[i], &value)) + elog(ERROR, "found deleted key 0x" UINT64_HEX_FORMAT, keys[i]); + } + + rt_stats(radixtree); + + pfree(keys); + rt_free(radixtree); +#ifdef RT_SHMEM + dsa_detach(dsa); +#endif +} + +/* + * Check if keys from start to end with the shift exist in the tree. + */ +static void +check_search_on_node(rt_radix_tree *radixtree, uint8 shift, int start, int end) +{ + for (int i = start; i <= end; i++) + { + uint64 key = ((uint64) i << shift); + TestValueType val; + + if (!rt_search(radixtree, key, &val)) + elog(ERROR, "key 0x" UINT64_HEX_FORMAT " is not found on node-%d", + key, end); + if (val != (TestValueType) key) + elog(ERROR, "rt_search with key 0x" UINT64_HEX_FORMAT " returns 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT, + key, val, key); + } +} + +/* + * Insert 256 key-value pairs, and check if keys are properly inserted on each + * node class. + */ +/* Test keys [0, 256) */ +#define NODE_TYPE_TEST_KEY_MIN 0 +#define NODE_TYPE_TEST_KEY_MAX 256 +static void +test_node_types_insert_asc(rt_radix_tree *radixtree, uint8 shift) +{ + uint64 num_entries; + int node_class_idx = 0; + uint64 key_checked = 0; + + for (int i = NODE_TYPE_TEST_KEY_MIN; i < NODE_TYPE_TEST_KEY_MAX; i++) + { + uint64 key = ((uint64) i << shift); + bool found; + + found = rt_set(radixtree, key, (TestValueType *) &key); + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " is found", key); + + /* + * After filling all slots in each node type, check if the values + * are stored properly. + */ + if ((i + 1) == rt_node_class_fanouts[node_class_idx]) + { + check_search_on_node(radixtree, shift, key_checked, i); + key_checked = i; + node_class_idx++; + } + } + + num_entries = rt_num_entries(radixtree); + if (num_entries != 256) + elog(ERROR, + "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, + num_entries, UINT64CONST(256)); +} + +/* + * Similar to test_node_types_insert_asc(), but inserts keys in descending order. 
+ */ +static void +test_node_types_insert_desc(rt_radix_tree *radixtree, uint8 shift) +{ + uint64 num_entries; + int node_class_idx = 0; + uint64 key_checked = NODE_TYPE_TEST_KEY_MAX - 1; + + for (int i = NODE_TYPE_TEST_KEY_MAX - 1; i >= NODE_TYPE_TEST_KEY_MIN; i--) + { + uint64 key = ((uint64) i << shift); + bool found; + + found = rt_set(radixtree, key, (TestValueType *) &key); + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " is found", key); + + if ((i + 1) == rt_node_class_fanouts[node_class_idx]) + { + check_search_on_node(radixtree, shift, i, key_checked); + key_checked = i; + node_class_idx++; + } + } + + num_entries = rt_num_entries(radixtree); + if (num_entries != 256) + elog(ERROR, + "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, + num_entries, UINT64CONST(256)); +} + +static void pg_attribute_unused() +test_node_types_delete(rt_radix_tree *radixtree, uint8 shift) +{ + uint64 num_entries; + + for (int i = NODE_TYPE_TEST_KEY_MIN; i < NODE_TYPE_TEST_KEY_MAX; i++) + { + uint64 key = ((uint64) i << shift); + bool found; + + found = rt_delete(radixtree, key); + + if (!found) + elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, key); + } + + num_entries = rt_num_entries(radixtree); + + /* The tree must be empty */ + if (num_entries != 0) + elog(ERROR, + "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, + num_entries, UINT64CONST(256)); +} + +/* + * Test for inserting and deleting key-value pairs to each node type at the given shift + * level. + */ +static void pg_attribute_unused() +test_node_types(uint8 shift) +{ + rt_radix_tree *radixtree; + +#ifdef RT_SHMEM + int tranche_id = LWLockNewTrancheId(); + dsa_area *dsa; + + LWLockRegisterTranche(tranche_id, "test_radix_tree"); + dsa = dsa_create(tranche_id); +#endif + + elog(NOTICE, "testing radix tree node types with shift \"%d\"", shift); + +#ifdef RT_SHMEM + radixtree = rt_create(CurrentMemoryContext, dsa, tranche_id); +#else + radixtree = rt_create(CurrentMemoryContext); +#endif + + /* + * Insert and search entries for every node type at the 'shift' level, + * then delete all entries to make it empty, and insert and search entries + * again. + */ + test_node_types_insert_asc(radixtree, shift); + test_node_types_delete(radixtree, shift); + test_node_types_insert_desc(radixtree, shift); + + rt_free(radixtree); +#ifdef RT_SHMEM + dsa_detach(dsa); +#endif +} + +/* + * Test with a repeating pattern, defined by the 'spec'. + */ +static void +test_pattern(const test_spec * spec) +{ + rt_radix_tree *radixtree; + rt_iter *iter; + MemoryContext radixtree_ctx; + TimestampTz starttime; + TimestampTz endtime; + uint64 n; + uint64 last_int; + uint64 ndeleted; + uint64 nbefore; + uint64 nafter; + int patternlen; + uint64 *pattern_values; + uint64 pattern_num_values; +#ifdef RT_SHMEM + int tranche_id = LWLockNewTrancheId(); + dsa_area *dsa; + + LWLockRegisterTranche(tranche_id, "test_radix_tree"); + dsa = dsa_create(tranche_id); +#endif + + elog(NOTICE, "testing radix tree with pattern \"%s\"", spec->test_name); + if (rt_test_stats) + fprintf(stderr, "-----\ntesting radix tree with pattern \"%s\"\n", spec->test_name); + + /* Pre-process the pattern, creating an array of integers from it. */ + patternlen = strlen(spec->pattern_str); + pattern_values = palloc(patternlen * sizeof(uint64)); + pattern_num_values = 0; + for (int i = 0; i < patternlen; i++) + { + if (spec->pattern_str[i] == '1') + pattern_values[pattern_num_values++] = i; + } + + /* + * Allocate the radix tree. 
+ * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. + */ + radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, + "radixtree test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); + +#ifdef RT_SHMEM + radixtree = rt_create(radixtree_ctx, dsa, tranche_id); +#else + radixtree = rt_create(radixtree_ctx); +#endif + + + /* + * Add values to the set. + */ + starttime = GetCurrentTimestamp(); + + n = 0; + last_int = 0; + while (n < spec->num_values) + { + uint64 x = 0; + + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + bool found; + + x = last_int + pattern_values[i]; + + found = rt_set(radixtree, x, (TestValueType*) &x); + + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", x); + + n++; + } + last_int += spec->spacing; + } + + endtime = GetCurrentTimestamp(); + + if (rt_test_stats) + fprintf(stderr, "added " UINT64_FORMAT " values in %d ms\n", + spec->num_values, (int) (endtime - starttime) / 1000); + + /* + * Print stats on the amount of memory used. + * + * We print the usage reported by rt_memory_usage(), as well as the stats + * from the memory context. They should be in the same ballpark, but it's + * hard to automate testing that, so if you're making changes to the + * implementation, just observe that manually. + */ + if (rt_test_stats) + { + uint64 mem_usage; + + /* + * Also print memory usage as reported by rt_memory_usage(). It + * should be in the same ballpark as the usage reported by + * MemoryContextStats(). + */ + mem_usage = rt_memory_usage(radixtree); + fprintf(stderr, "rt_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n", + mem_usage, (double) mem_usage / spec->num_values); + + MemoryContextStats(radixtree_ctx); + } + + /* Check that rt_num_entries works */ + n = rt_num_entries(radixtree); + if (n != spec->num_values) + elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, n, spec->num_values); + + /* + * Test random-access probes with rt_search() + */ + starttime = GetCurrentTimestamp(); + + for (n = 0; n < 100000; n++) + { + bool found; + bool expected; + uint64 x; + TestValueType v; + + /* + * Pick next value to probe at random. We limit the probes to the + * last integer that we added to the set, plus an arbitrary constant + * (1000). There's no point in probing the whole 0 - 2^64 range, if + * only a small part of the integer space is used. We would very + * rarely hit values that are actually in the set. + */ + x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000); + + /* Do we expect this value to be present in the set? */ + if (x >= last_int) + expected = false; + else + { + uint64 idx = x % spec->spacing; + + if (idx >= patternlen) + expected = false; + else if (spec->pattern_str[idx] == '1') + expected = true; + else + expected = false; + } + + /* Is it present according to rt_search() ? 
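+ * For example, with the "clusters of ten" spec (pattern_str "1111111111",
+ * spacing 10000): probing x = 20003 gives idx = x % spacing = 3, which is
+ * within patternlen and pattern_str[3] == '1', so the key is expected to
+ * be present; probing x = 20015 gives idx = 15 >= patternlen, so it is
+ * expected to be absent. (Worked example added for illustration.)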
*/ + found = rt_search(radixtree, x, &v); + + if (found != expected) + elog(ERROR, "mismatch at 0x" UINT64_HEX_FORMAT ": %d vs %d", x, found, expected); + if (found && (v != (TestValueType) x)) + elog(ERROR, "found 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT, + v, x); + } + endtime = GetCurrentTimestamp(); + if (rt_test_stats) + fprintf(stderr, "probed " UINT64_FORMAT " values in %d ms\n", + n, (int) (endtime - starttime) / 1000); + + /* + * Test iterator + */ + starttime = GetCurrentTimestamp(); + + iter = rt_begin_iterate(radixtree); + n = 0; + last_int = 0; + while (n < spec->num_values) + { + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + uint64 expected = last_int + pattern_values[i]; + uint64 x; + TestValueType val; + + if (!rt_iterate_next(iter, &x, &val)) + break; + + if (x != expected) + elog(ERROR, + "iterate returned wrong key; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", + x, expected, i); + if (val != (TestValueType) expected) + elog(ERROR, + "iterate returned wrong value; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", x, expected, i); + n++; + } + last_int += spec->spacing; + } + endtime = GetCurrentTimestamp(); + if (rt_test_stats) + fprintf(stderr, "iterated " UINT64_FORMAT " values in %d ms\n", + n, (int) (endtime - starttime) / 1000); + + rt_end_iterate(iter); + + if (n < spec->num_values) + elog(ERROR, "iterator stopped short after " UINT64_FORMAT " entries, expected " UINT64_FORMAT, n, spec->num_values); + if (n > spec->num_values) + elog(ERROR, "iterator returned " UINT64_FORMAT " entries, " UINT64_FORMAT " was expected", n, spec->num_values); + + /* + * Test random-access probes with rt_delete() + */ + starttime = GetCurrentTimestamp(); + + nbefore = rt_num_entries(radixtree); + ndeleted = 0; + for (n = 0; n < 1; n++) + { + bool found; + uint64 x; + TestValueType v; + + /* + * Pick next value to probe at random. We limit the probes to the + * last integer that we added to the set, plus an arbitrary constant + * (1000). There's no point in probing the whole 0 - 2^64 range, if + * only a small part of the integer space is used. We would very + * rarely hit values that are actually in the set. + */ + x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000); + + /* Is it present according to rt_search() ? 
*/ + found = rt_search(radixtree, x, &v); + + if (!found) + continue; + + /* If the key is found, delete it and check again */ + if (!rt_delete(radixtree, x)) + elog(ERROR, "could not delete key 0x" UINT64_HEX_FORMAT, x); + if (rt_search(radixtree, x, &v)) + elog(ERROR, "found deleted key 0x" UINT64_HEX_FORMAT, x); + if (rt_delete(radixtree, x)) + elog(ERROR, "deleted already-deleted key 0x" UINT64_HEX_FORMAT, x); + + ndeleted++; + } + endtime = GetCurrentTimestamp(); + if (rt_test_stats) + fprintf(stderr, "deleted " UINT64_FORMAT " values in %d ms\n", + ndeleted, (int) (endtime - starttime) / 1000); + + nafter = rt_num_entries(radixtree); + + /* Check that rt_num_entries works */ + if ((nbefore - ndeleted) != nafter) + elog(ERROR, "rt_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT "after " UINT64_FORMAT " deletion", + nafter, (nbefore - ndeleted), ndeleted); + + rt_free(radixtree); + MemoryContextDelete(radixtree_ctx); +#ifdef RT_SHMEM + dsa_detach(dsa); +#endif +} + +Datum +test_radixtree(PG_FUNCTION_ARGS) +{ + test_empty(); + + for (int i = 0; i < lengthof(rt_node_class_fanouts); i++) + { + test_basic(rt_node_class_fanouts[i], 0, false); + test_basic(rt_node_class_fanouts[i], 0, true); + test_basic(rt_node_class_fanouts[i], 1, false); + test_basic(rt_node_class_fanouts[i], 1, true); + } + + for (int shift = 0; shift <= (64 - 8); shift += 8) + test_node_types(shift); + + /* Test different test patterns, with lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + test_pattern(&test_specs[i]); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_radixtree/test_radixtree.control b/src/test/modules/test_radixtree/test_radixtree.control new file mode 100644 index 0000000000..e53f2a3e0c --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.control @@ -0,0 +1,4 @@ +comment = 'Test code for radix tree' +default_version = '1.0' +module_pathname = '$libdir/test_radixtree' +relocatable = true diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck index 4e09c4686b..202bf1c04e 100755 --- a/src/tools/pginclude/cpluspluscheck +++ b/src/tools/pginclude/cpluspluscheck @@ -101,6 +101,12 @@ do test "$f" = src/include/nodes/nodetags.h && continue test "$f" = src/backend/nodes/nodetags.h && continue + # radixtree_*_impl.h cannot be included standalone: they are just code fragments. + test "$f" = src/include/lib/radixtree_delete_impl.h && continue + test "$f" = src/include/lib/radixtree_insert_impl.h && continue + test "$f" = src/include/lib/radixtree_iter_impl.h && continue + test "$f" = src/include/lib/radixtree_search_impl.h && continue + # These files are not meant to be included standalone, because # they contain lists that might have multiple use-cases. test "$f" = src/include/access/rmgrlist.h && continue diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 8dee1b5670..133313255c 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -96,6 +96,12 @@ do test "$f" = src/include/nodes/nodetags.h && continue test "$f" = src/backend/nodes/nodetags.h && continue + # radixtree_*_impl.h cannot be included standalone: they are just code fragments. 
+ test "$f" = src/include/lib/radixtree_delete_impl.h && continue + test "$f" = src/include/lib/radixtree_insert_impl.h && continue + test "$f" = src/include/lib/radixtree_iter_impl.h && continue + test "$f" = src/include/lib/radixtree_search_impl.h && continue + # These files are not meant to be included standalone, because # they contain lists that might have multiple use-cases. test "$f" = src/include/access/rmgrlist.h && continue -- 2.41.0