diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 9dad31398a..fd002d594a 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -22,6 +22,9 @@ OBJS = \ integerset.o \ knapsack.o \ pairingheap.o \ + radixtree.o \ rbtree.o \ +radixtree.o: CFLAGS+=-mavx2 + include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/radixtree.c b/src/backend/lib/radixtree.c new file mode 100644 index 0000000000..a5ad897ee9 --- /dev/null +++ b/src/backend/lib/radixtree.c @@ -0,0 +1,1377 @@ +/*------------------------------------------------------------------------- + * + * radixtree.c + * Implementation for adaptive radix tree. + * + * This module is based on the paper "The Adaptive Radix Tree: ARTful Indexing + * for Main-Memory Databases" by Viktor Leis, Alfons Kemper, and Thomas Neumann, + * 2013. + * + * There are some difference from the proposed implementation. For instance, + * this radix tree module utilize AVX2 instruction, enabling us to use 256-bit + * width SIMD vector, whereas 128-bit witdh SIMD vector is used in the paper. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/lib/radixtree.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "utils/memutils.h" +#include "lib/radixtree.h" +#include "lib/stringinfo.h" + +#if defined(__AVX2__) +#include // x86 AVX2 intrinsics +#endif + +/* How many bits are encoded in one tree level */ +#define RADIX_TREE_NODE_FANOUT 8 + +#define RADIX_TREE_NODE_MAX_SLOTS (1 << RADIX_TREE_NODE_FANOUT) +#define RADIX_TREE_NODE_MAX_SLOT_BITS \ + (RADIX_TREE_NODE_MAX_SLOTS / (sizeof(uint8) * BITS_PER_BYTE)) + +#define RADIX_TREE_CHUNK_MASK ((1 << RADIX_TREE_NODE_FANOUT) - 1) +#define RADIX_TREE_MAX_SHIFT key_get_shift(UINT64_MAX) +#define RADIX_TREE_MAX_LEVEL ((sizeof(uint64) * BITS_PER_BYTE) / RADIX_TREE_NODE_FANOUT) + +#define GET_KEY_CHUNK(key, shift) \ + ((uint8) (((key) >> (shift)) & RADIX_TREE_CHUNK_MASK)) + +typedef enum radix_tree_node_kind +{ + RADIX_TREE_NODE_KIND_4 = 0, + RADIX_TREE_NODE_KIND_32, + RADIX_TREE_NODE_KIND_128, + RADIX_TREE_NODE_KIND_256 +} radix_tree_node_kind; +#define RADIX_TREE_NODE_KIND_COUNT 4 + +/* + * Base type for all nodes types. + * + * The key is a 64-bit unsigned integer and the value is a Datum. The internal + * tree nodes, shift > 0, store the pointer to its child nodes as a Datum value. + * The leaf nodes, shift == 0, stores the value that the user specified as a Datum + * value. + */ +typedef struct radix_tree_node +{ + /* + * Number of children. We use uint16 to be able to indicate 256 children + * at fanout of 8. + */ + uint16 count; + + /* + * Shift indicates which part of the key space is represented by this node. + * That is, the key is shifted by 'shift' and the lowest RADIX_TREE_NODE_FANOUT + * bits are then represented in chunk. + */ + uint8 shift; + uint8 chunk; + + /* Size class of the node */ + radix_tree_node_kind kind; +} radix_tree_node; +#define NodeIsLeaf(n) (((radix_tree_node *) (n))->shift == 0) +#define NodeHasFreeSlot(n) \ + (((radix_tree_node *) (n))->count < \ + radix_tree_node_info[((radix_tree_node *) (n))->kind].max_slots) + +/* + * To reduce memory usage compared to a simple radix tree with a fixed fanout + * we use adaptive node sides, with different storage methods for different + * numbers of elements. + */ +typedef struct radix_tree_node_4 +{ + radix_tree_node n; + + /* 4 children, for key chunks */ + uint8 chunks[4]; + Datum slots[4]; +} radix_tree_node_4; + +typedef struct radix_tree_node_32 +{ + radix_tree_node n; + + /* 32 children, for key chunks */ + uint8 chunks[32]; + Datum slots[32]; +} radix_tree_node_32; + +typedef struct radix_tree_node_128 +{ + radix_tree_node n; + + /* + * The index of slots for each fanout. 0 means unused whereas slots is + * 0-indexed. So we can get the slots of the chunk C by slots[C - 1]. + */ + uint8 slot_idxs[RADIX_TREE_NODE_MAX_SLOTS]; + + Datum slots[128]; +} radix_tree_node_128; + +typedef struct radix_tree_node_256 +{ + radix_tree_node n; + + /* A bitmap to track which slot is in use */ + uint8 set[RADIX_TREE_NODE_MAX_SLOT_BITS]; + + Datum slots[RADIX_TREE_NODE_MAX_SLOTS]; +} radix_tree_node_256; +#define RADIX_TREE_NODE_256_SET_BYTE(v) ((v) / RADIX_TREE_NODE_FANOUT) +#define RADIX_TREE_NODE_256_SET_BIT(v) (UINT64_C(1) << ((v) % RADIX_TREE_NODE_FANOUT)) + +/* Information of each size class */ +typedef struct radix_tree_node_info_elem +{ + const char *name; + int max_slots; + Size size; +} radix_tree_node_info_elem; + +static radix_tree_node_info_elem radix_tree_node_info[] = +{ + {"radix tree node 4", 4, sizeof(radix_tree_node_4)}, + {"radix tree node 32", 32, sizeof(radix_tree_node_32)}, + {"radix tree node 128", 128, sizeof(radix_tree_node_128)}, + {"radix tree node 256", 256, sizeof(radix_tree_node_256)}, +}; + +/* + * Iteration support. + * + * Iterating the radix tree returns each pair of key and value in the ascending order + * of the key. To support this, the we iterate nodes of each level. + * radix_tree_iter_node_data struct is used to track the iteration within a node. + * radix_tree_iter has the array of this struct, stack, in order to track the iteration + * of every level. During the iteration, we also construct the key to return. The key + * is updated whenever we update the node iteration information, e.g., when advancing + * the current index within the node or when moving to the next node at the same level. + */ +typedef struct radix_tree_iter_node_data +{ + radix_tree_node *node; /* current node being iterated */ + int current_idx; /* current position. -1 for initial value */ +} radix_tree_iter_node_data; + +struct radix_tree_iter +{ + radix_tree *tree; + + /* Track the iteration on nodes of each level */ + radix_tree_iter_node_data stack[RADIX_TREE_MAX_LEVEL]; + int stack_len; + + /* The key is being constructed during the iteration */ + uint64 key; +}; + +/* A radix tree with nodes */ +struct radix_tree +{ + MemoryContext context; + + radix_tree_node *root; + uint64 max_val; + uint64 num_keys; + MemoryContextData *slabs[RADIX_TREE_NODE_KIND_COUNT]; + + /* stats */ + uint64 mem_used; + int32 cnt[RADIX_TREE_NODE_KIND_COUNT]; +}; + +static radix_tree_node *radix_tree_node_grow(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node); +static radix_tree_node *radix_tree_find_child(radix_tree_node *node, uint64 key); +static Datum *radix_tree_find_slot_ptr(radix_tree_node *node, uint8 chunk); +static void radix_tree_replace_slot(radix_tree_node *parent, radix_tree_node *node, + uint8 chunk); +static void radix_tree_extend(radix_tree *tree, uint64 key); +static void radix_tree_new_root(radix_tree *tree, uint64 key, Datum val); +static radix_tree_node *radix_tree_insert_child(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node, + uint64 key); +static void radix_tree_insert_val(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node, + uint64 key, Datum val, bool *replaced_p); + +static inline void radix_tree_iter_update_key(radix_tree_iter *iter, uint8 chunk, uint8 shift); +static Datum radix_tree_node_iterate_next(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter, + bool *found_p); +static void radix_tree_store_iter_node(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter, + radix_tree_node *node); +static void radix_tree_update_iter_stack(radix_tree_iter *iter, int from); + +static inline int +node_32_search_eq(radix_tree_node_32 *node, uint8 chunk) +{ +#ifdef __AVX2__ + __m256i _key = _mm256_set1_epi8(chunk); + __m256i _data = _mm256_loadu_si256((__m256i_u *) node->chunks); + __m256i _cmp = _mm256_cmpeq_epi8(_key, _data); + uint32 bitfield = _mm256_movemask_epi8(_cmp); + + bitfield &= ((UINT64_C(1) << node->n.count) - 1); + + return (bitfield) ? __builtin_ctz(bitfield) : -1; + +#else + for (int i = 0; i < node->n.count; i++) + { + if (node->chunks[i] > chunk) + return -1; + + if (node->chunks[i] == chunk) + return i; + } + + return -1; +#endif /* __AVX2__ */ +} + +/* + * This is a bit more complicated than search_chunk_array_16_eq(), because + * until recently no unsigned uint8 comparison instruction existed on x86. So + * we need to play some trickery using _mm_min_epu8() to effectively get + * <=. There never will be any equal elements in the current uses, but that's + * what we get here... + */ +static inline int +node_32_search_le(radix_tree_node_32 *node, uint8 chunk) +{ +#ifdef __AVX2__ + __m256i _key = _mm256_set1_epi8(chunk); + __m256i _data = _mm256_loadu_si256((__m256i_u*) node->chunks); + __m256i _min = _mm256_min_epu8(_key, _data); + __m256i cmp = _mm256_cmpeq_epi8(_key, _min); + uint32_t bitfield=_mm256_movemask_epi8(cmp); + + bitfield &= ((UINT64_C(1) << node->n.count) - 1); + + return (bitfield) ? __builtin_ctz(bitfield) : node->n.count; +#else + int index; + + for (index = 0; index < node->n.count; index++) + { + if (node->chunks[index] >= chunk) + break; + } + + return index; +#endif /* __AVX2__ */ +} + +static inline int +node_128_get_slot_pos(radix_tree_node_128 *node, uint8 chunk) +{ + return node->slot_idxs[chunk] - 1; +} + +static inline bool +node_128_is_slot_used(radix_tree_node_128 *node, uint8 chunk) +{ + return (node_128_get_slot_pos(node, chunk) >= 0); +} + +/* Return true if the slot corresponding to the given chunk is in use */ +static inline bool +node_256_is_slot_used(radix_tree_node_256 *node, uint8 chunk) +{ + return (node->set[RADIX_TREE_NODE_256_SET_BYTE(chunk)] & + RADIX_TREE_NODE_256_SET_BIT(chunk)) != 0; + +} + +/* Set the slot at the given chunk position */ +static inline void +node_256_set(radix_tree_node_256 *node, uint8 chunk, Datum slot) +{ + node->slots[chunk] = slot; + node->set[RADIX_TREE_NODE_256_SET_BYTE(chunk)] |= RADIX_TREE_NODE_256_SET_BIT(chunk); +} + +/* Return the shift that is satisfied to store the given key */ +inline static int +key_get_shift(uint64 key) +{ + return (key == 0) + ? 0 + : (pg_leftmost_one_pos64(key) / RADIX_TREE_NODE_FANOUT) * RADIX_TREE_NODE_FANOUT; +} + +/* Return the max value stored in a node with the given shift */ +static uint64 +shift_get_max_val(int shift) +{ + if (shift == RADIX_TREE_MAX_SHIFT) + return UINT64_MAX; + + return (UINT64_C(1) << (shift + RADIX_TREE_NODE_FANOUT)) - 1; +} + +/* Allocate a new node with the given node kind */ +static radix_tree_node * +radix_tree_alloc_node(radix_tree *tree, radix_tree_node_kind kind) +{ + radix_tree_node *newnode; + + newnode = (radix_tree_node *) MemoryContextAllocZero(tree->slabs[kind], + radix_tree_node_info[kind].size); + newnode->kind = kind; + + /* update stats */ + tree->mem_used += GetMemoryChunkSpace(newnode); + tree->cnt[kind]++; + + return newnode; +} + +/* Free the given node */ +static void +radix_tree_free_node(radix_tree *tree, radix_tree_node *node) +{ + /* update stats */ + tree->mem_used -= GetMemoryChunkSpace(node); + tree->cnt[node->kind]--; + + pfree(node); +} + +/* Copy the common fields without the node kind */ +static void +radix_tree_copy_node_common(radix_tree_node *src, radix_tree_node *dst) +{ + dst->shift = src->shift; + dst->chunk = src->chunk; + dst->count = src->count; +} + +/* The tree doesn't have not sufficient height, so grow it */ +static void +radix_tree_extend(radix_tree *tree, uint64 key) +{ + int max_shift; + int shift = tree->root->shift + RADIX_TREE_NODE_FANOUT; + + max_shift = key_get_shift(key); + + /* Grow tree from 'shift' to 'max_shift' */ + while (shift <= max_shift) + { + radix_tree_node_4 *node = + (radix_tree_node_4 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4); + + node->n.count = 1; + node->n.shift = shift; + node->chunks[0] = 0; + node->slots[0] = PointerGetDatum(tree->root); + + tree->root->chunk = 0; + tree->root = (radix_tree_node *) node; + + shift += RADIX_TREE_NODE_FANOUT; + } + + tree->max_val = shift_get_max_val(max_shift); +} + +/* + * Return the pointer to the child node corresponding with the key. Otherwise (if + * not found) return NULL. + */ +static radix_tree_node * +radix_tree_find_child(radix_tree_node *node, uint64 key) +{ + Datum *slot_ptr; + int chunk = GET_KEY_CHUNK(key, node->shift); + + slot_ptr = radix_tree_find_slot_ptr(node, chunk); + + return (slot_ptr == NULL) ? NULL : (radix_tree_node *) DatumGetPointer(*slot_ptr); +} + +/* + * Return the address of the slot corresponding to chunk in the node, if found. + * Otherwise return NULL. + */ +static Datum * +radix_tree_find_slot_ptr(radix_tree_node *node, uint8 chunk) +{ + + switch (node->kind) + { + case RADIX_TREE_NODE_KIND_4: + { + radix_tree_node_4 *n4 = (radix_tree_node_4 *) node; + + /* Do linear search */ + for (int i = 0; i < n4->n.count; i++) + { + if (n4->chunks[i] > chunk) + break; + + if (n4->chunks[i] == chunk) + return &(n4->slots[i]); + } + + break; + } + case RADIX_TREE_NODE_KIND_32: + { + radix_tree_node_32 *n32 = (radix_tree_node_32 *) node; + int ret; + + /* Search by SIMD instructions */ + ret = node_32_search_eq(n32, chunk); + + if (ret < 0) + break; + + return &(n32->slots[ret]); + break; + } + case RADIX_TREE_NODE_KIND_128: + { + radix_tree_node_128 *n128 = (radix_tree_node_128 *) node; + + if (!node_128_is_slot_used(n128, chunk)) + break; + + return &(n128->slots[node_128_get_slot_pos(n128, chunk)]); + break; + } + case RADIX_TREE_NODE_KIND_256: + { + radix_tree_node_256 *n256 = (radix_tree_node_256 *) node; + + if (!node_256_is_slot_used(n256, chunk)) + break; + + return &(n256->slots[chunk]); + break; + } + } + + return NULL; +} + +/* Link from the parent to the node */ +static void +radix_tree_replace_slot(radix_tree_node *parent, radix_tree_node *node, uint8 chunk) +{ + Datum *slot_ptr; + + slot_ptr = radix_tree_find_slot_ptr(parent, chunk); + *slot_ptr = PointerGetDatum(node); +} + +/* + * Create a new node as the root. Subordinate nodes will be created during + * the insertion. + */ +static void +radix_tree_new_root(radix_tree *tree, uint64 key, Datum val) +{ + radix_tree_node_4 * n4 = + (radix_tree_node_4 * ) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4); + int shift = key_get_shift(key); + + n4->n.shift = shift; + tree->max_val = shift_get_max_val(shift); + tree->root = (radix_tree_node *) n4; +} + +/* Insert 'node' as a child node of 'parent' */ +static radix_tree_node * +radix_tree_insert_child(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node, + uint64 key) +{ + radix_tree_node *newchild = + (radix_tree_node *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_4); + + Assert(!NodeIsLeaf(node)); + + newchild->shift = node->shift - RADIX_TREE_NODE_FANOUT; + newchild->chunk = GET_KEY_CHUNK(key, node->shift); + + radix_tree_insert_val(tree, parent, node, key, PointerGetDatum(newchild), NULL); + + return (radix_tree_node *) newchild; +} + +/* + * Insert the value to the node. The node grows if it's full. + * + * *replaced_p is set to true if the key already exists and its value is updated + * by this function. + */ +static void +radix_tree_insert_val(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node, + uint64 key, Datum val, bool *replaced_p) +{ + int chunk = GET_KEY_CHUNK(key, node->shift); + bool replaced = false; + + switch (node->kind) + { + case RADIX_TREE_NODE_KIND_4: + { + radix_tree_node_4 *n4 = (radix_tree_node_4 *) node; + int idx; + + for (idx = 0; idx < n4->n.count; idx++) + { + if (n4->chunks[idx] >= chunk) + break; + } + + if (NodeHasFreeSlot(n4)) + { + if (n4->n.count == 0) + { + /* the first key for this node, add it */ + } + else if (n4->chunks[idx] == chunk) + { + /* found the key, replace it */ + replaced = true; + } + else if (idx != n4->n.count) + { + /* + * the key needs to be inserted in the middle of the array, + * make space for the new key. + */ + memmove(&(n4->chunks[idx + 1]), &(n4->chunks[idx]), + sizeof(uint8) * (n4->n.count - idx)); + memmove(&(n4->slots[idx + 1]), &(n4->slots[idx]), + sizeof(radix_tree_node *) * (n4->n.count - idx)); + } + + n4->chunks[idx] = chunk; + n4->slots[idx] = val; + + /* Done */ + break; + } + + /* The node needs to grow */ + node = radix_tree_node_grow(tree, parent, node); + Assert(node->kind == RADIX_TREE_NODE_KIND_32); + } + /* FALLTHROUGH */ + case RADIX_TREE_NODE_KIND_32: + { + radix_tree_node_32 *n32 = (radix_tree_node_32 *) node; + int idx; + + idx = node_32_search_le(n32, chunk); + + if (NodeHasFreeSlot(n32)) + { + if (n32->n.count == 0) + { + /* first key for this node, add it */ + } + else if (n32->chunks[idx] == chunk) + { + /* found the key, replace it */ + replaced = true; + } + else if (idx != n32->n.count) + { + /* + * the key needs to be inserted in the middle of the array, + * make space for the new key. + */ + memmove(&(n32->chunks[idx + 1]), &(n32->chunks[idx]), + sizeof(uint8) * (n32->n.count - idx)); + memmove(&(n32->slots[idx + 1]), &(n32->slots[idx]), + sizeof(radix_tree_node *) * (n32->n.count - idx)); + } + + n32->chunks[idx] = chunk; + n32->slots[idx] = val; + break; + } + + /* The node needs to grow */ + node = radix_tree_node_grow(tree, parent, node); + Assert(node->kind == RADIX_TREE_NODE_KIND_128); + } + /* FALLTHROUGH */ + case RADIX_TREE_NODE_KIND_128: + { + radix_tree_node_128 *n128 = (radix_tree_node_128 *) node; + + if (node_128_is_slot_used(n128, chunk)) + { + n128->slots[node_128_get_slot_pos(n128, chunk)] = val; + replaced = true; + break; + } + + if (NodeHasFreeSlot(n128)) + { + uint8 pos = n128->n.count + 1; + + n128->slot_idxs[chunk] = pos; + n128->slots[pos - 1] = val; + break; + } + + node = radix_tree_node_grow(tree, parent, node); + Assert(node->kind == RADIX_TREE_NODE_KIND_256); + } + /* FALLTHROUGH */ + case RADIX_TREE_NODE_KIND_256: + { + radix_tree_node_256 *n256 = (radix_tree_node_256 *) node; + + if (node_256_is_slot_used(n256, chunk)) + replaced = true; + + node_256_set(n256, chunk, val); + break; + } + } + + if (!replaced) + node->count++; + + if (replaced_p) + *replaced_p = replaced; +} + +/* Change the node type to a larger one */ +static radix_tree_node * +radix_tree_node_grow(radix_tree *tree, radix_tree_node *parent, radix_tree_node *node) +{ + radix_tree_node *newnode = NULL; + + Assert(node->count == + radix_tree_node_info[node->kind].max_slots); + + switch (node->kind) + { + case RADIX_TREE_NODE_KIND_4: + { + radix_tree_node_4 *n4 = (radix_tree_node_4 *) node; + radix_tree_node_32 *new32 = + (radix_tree_node_32 *) radix_tree_alloc_node(tree, RADIX_TREE_NODE_KIND_32); + + radix_tree_copy_node_common((radix_tree_node *) n4, + (radix_tree_node *) new32); + + memcpy(&(new32->chunks), &(n4->chunks), sizeof(uint8) * 4); + memcpy(&(new32->slots), &(n4->slots), sizeof(Datum) * 4); + + newnode = (radix_tree_node *) new32; + break; + } + case RADIX_TREE_NODE_KIND_32: + { + radix_tree_node_32 *n32 = (radix_tree_node_32 *) node; + radix_tree_node_128 *new128 = + (radix_tree_node_128 *) radix_tree_alloc_node(tree,RADIX_TREE_NODE_KIND_128); + + radix_tree_copy_node_common((radix_tree_node *) n32, + (radix_tree_node *) new128); + + for (int i = 0; i < n32->n.count; i++) + { + new128->slot_idxs[n32->chunks[i]] = i + 1; + new128->slots[i] = n32->slots[i]; + } + + newnode = (radix_tree_node *) new128; + break; + } + case RADIX_TREE_NODE_KIND_128: + { + radix_tree_node_128 *n128 = (radix_tree_node_128 *) node; + radix_tree_node_256 *new256 = + (radix_tree_node_256 *) radix_tree_alloc_node(tree,RADIX_TREE_NODE_KIND_256); + int cnt = 0; + + radix_tree_copy_node_common((radix_tree_node *) n128, + (radix_tree_node *) new256); + + for (int i = 0; i < RADIX_TREE_NODE_MAX_SLOTS && cnt < n128->n.count; i++) + { + if (!node_128_is_slot_used(n128, i)) + continue; + + node_256_set(new256, i, n128->slots[node_128_get_slot_pos(n128, i)]); + cnt++; + } + + newnode = (radix_tree_node *) new256; + break; + } + case RADIX_TREE_NODE_KIND_256: + elog(ERROR, "radix tree node_256 cannot grow"); + break; + } + + /* Replace the old node with the new one */ + if (parent == node) + tree->root = newnode; + else + radix_tree_replace_slot(parent, newnode, node->chunk); + + /* Free the old node */ + radix_tree_free_node(tree, node); + + return newnode; +} + +/* Create the radix tree in the given memory context */ +radix_tree * +radix_tree_create(MemoryContext ctx) +{ + radix_tree *tree; + MemoryContext old_ctx; + + old_ctx = MemoryContextSwitchTo(ctx); + + tree = palloc(sizeof(radix_tree)); + tree->max_val = 0; + tree->root = NULL; + tree->context = ctx; + tree->num_keys = 0; + tree->mem_used = 0; + + /* Create the slab allocator for each size class */ + for (int i = 0; i < RADIX_TREE_NODE_KIND_COUNT; i++) + { + tree->slabs[i] = SlabContextCreate(ctx, + radix_tree_node_info[i].name, + SLAB_DEFAULT_BLOCK_SIZE, + radix_tree_node_info[i].size); + tree->cnt[i] = 0; + } + + MemoryContextSwitchTo(old_ctx); + + return tree; +} + +void +radix_tree_destroy(radix_tree *tree) +{ + for (int i = 0; i < RADIX_TREE_NODE_KIND_COUNT; i++) + MemoryContextDelete(tree->slabs[i]); + + pfree(tree); +} + +/* + * Insert the key with the val. + * + * found_p is set to true if the key already present, otherwise false, if + * it's not NULL. + * + * XXX: consider a better API. Is it better to support like 'update' flag + * instead of 'found_p' so the user can asks to update the value if already + * exists? + */ +void +radix_tree_insert(radix_tree *tree, uint64 key, Datum val, bool *found_p) +{ + int shift; + bool replaced; + radix_tree_node *node; + radix_tree_node *parent = tree->root; + + /* Empty tree, create the root */ + if (!tree->root) + radix_tree_new_root(tree, key, val); + + /* Extend the tree if necessary */ + if (key > tree->max_val) + radix_tree_extend(tree, key); + + Assert(tree->root); + + shift = tree->root->shift; + node = tree->root; + while (shift > 0) + { + radix_tree_node *child; + + child = radix_tree_find_child(node, key); + + if (child == NULL) + child = radix_tree_insert_child(tree, parent, node, key); + + parent = node; + node = child; + shift -= RADIX_TREE_NODE_FANOUT; + } + + /* arrived at a leaf, so insert the value */ + Assert(NodeIsLeaf(node)); + radix_tree_insert_val(tree, parent, node, key, val, &replaced); + + if (!replaced) + tree->num_keys++; + + if (found_p) + *found_p = replaced; +} + +/* + * Return the Datum value of the given key. + * + * found_p is set to true if it's found, otherwise false. + */ +Datum +radix_tree_search(radix_tree *tree, uint64 key, bool *found_p) +{ + radix_tree_node *node; + int shift; + + if (!tree->root || key > tree->max_val) + goto not_found; + + node = tree->root; + shift = tree->root->shift; + while (shift >= 0) + { + radix_tree_node *child; + + if (NodeIsLeaf(node)) + { + Datum *slot_ptr; + int chunk = GET_KEY_CHUNK(key, node->shift); + + /* We reached at a leaf node, find the corresponding slot */ + slot_ptr = radix_tree_find_slot_ptr(node, chunk); + + if (slot_ptr == NULL) + goto not_found; + + /* Found! */ + *found_p = true; + return *slot_ptr; + } + + child = radix_tree_find_child(node, key); + + if (child == NULL) + goto not_found; + + node = child; + shift -= RADIX_TREE_NODE_FANOUT; + } + +not_found: + *found_p = false; + return (Datum) 0; +} + +/* Create and return the iterator for the given radix tree */ +radix_tree_iter * +radix_tree_begin_iterate(radix_tree *tree) +{ + MemoryContext old_ctx; + radix_tree_iter *iter; + int top_level; + + old_ctx = MemoryContextSwitchTo(tree->context); + + iter = (radix_tree_iter *) palloc0(sizeof(radix_tree_iter)); + iter->tree = tree; + + /* empty tree */ + if (!iter->tree) + return iter; + + top_level = iter->tree->root->shift / RADIX_TREE_NODE_FANOUT; + + iter->stack_len = top_level; + iter->stack[top_level].node = iter->tree->root; + iter->stack[top_level].current_idx = -1; + + /* Descend to the left most leaf node from the root */ + radix_tree_update_iter_stack(iter, top_level); + + MemoryContextSwitchTo(old_ctx); + + return iter; +} + +/* + * Return true with setting key_p and value_p if there is next key. Otherwise, + * return false. + */ +bool +radix_tree_iterate_next(radix_tree_iter *iter, uint64 *key_p, Datum *value_p) +{ + bool found = false; + Datum slot = (Datum) 0; + int level; + + /* Empty tree */ + if (!iter->tree) + return false; + + for (;;) + { + radix_tree_node *node; + radix_tree_iter_node_data *node_iter; + + /* + * Iterate node at each level from the bottom of the tree until we find + * the next slot. + */ + for (level = 0; level <= iter->stack_len; level++) + { + slot = radix_tree_node_iterate_next(iter, &(iter->stack[level]), &found); + + if (found) + break; + } + + /* end of iteration */ + if (!found) + return false; + + /* found the next slot at the leaf node, return it */ + if (level == 0) + { + *key_p = iter->key; + *value_p = slot; + return true; + } + + /* + * We have advanced more than one nodes including internal nodes. So we need + * to update the stack by descending to the left most leaf node from this level. + */ + node = (radix_tree_node *) DatumGetPointer(slot); + node_iter = &(iter->stack[level - 1]); + radix_tree_store_iter_node(iter, node_iter, node); + + radix_tree_update_iter_stack(iter, level - 1); + } +} + +void +radix_tree_end_iterate(radix_tree_iter *iter) +{ + pfree(iter); +} + +/* + * Update the part of the key being constructed during the iteration with the + * given chunk + */ +static inline void +radix_tree_iter_update_key(radix_tree_iter *iter, uint8 chunk, uint8 shift) +{ + iter->key &= ~(((uint64) RADIX_TREE_CHUNK_MASK) << shift); + iter->key |= (((uint64) chunk) << shift); +} + +/* + * Iterate over the given radix tree node and returns the next slot of the given + * node and set true to *found_p, if any. Otherwise, set false to *found_p. + */ +static Datum +radix_tree_node_iterate_next(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter, + bool *found_p) +{ + radix_tree_node *node = node_iter->node; + Datum slot = (Datum) 0; + + switch (node->kind) + { + case RADIX_TREE_NODE_KIND_4: + { + radix_tree_node_4 *n4 = (radix_tree_node_4 *) node_iter->node; + + node_iter->current_idx++; + + if (node_iter->current_idx >= n4->n.count) + goto not_found; + + slot = n4->slots[node_iter->current_idx]; + + /* Update the part of the key with the current chunk */ + if (NodeIsLeaf(node)) + radix_tree_iter_update_key(iter, n4->chunks[node_iter->current_idx], 0); + + break; + } + case RADIX_TREE_NODE_KIND_32: + { + radix_tree_node_32 *n32 = (radix_tree_node_32 *) node; + + node_iter->current_idx++; + + if (node_iter->current_idx >= n32->n.count) + goto not_found; + + slot = n32->slots[node_iter->current_idx]; + + /* Update the part of the key with the current chunk */ + if (NodeIsLeaf(node)) + radix_tree_iter_update_key(iter, n32->chunks[node_iter->current_idx], 0); + + break; + } + case RADIX_TREE_NODE_KIND_128: + { + radix_tree_node_128 *n128 = (radix_tree_node_128 *) node; + int i; + + for (i = node_iter->current_idx + 1; i < RADIX_TREE_NODE_MAX_SLOTS; i++) + { + if (node_128_is_slot_used(n128, i)) + break; + } + + if (i >= RADIX_TREE_NODE_MAX_SLOTS) + goto not_found; + + node_iter->current_idx = i; + slot = n128->slots[node_128_get_slot_pos(n128, i)]; + + /* Update the part of the key */ + if (NodeIsLeaf(node)) + radix_tree_iter_update_key(iter, node_iter->current_idx, 0); + + break; + } + case RADIX_TREE_NODE_KIND_256: + { + radix_tree_node_256 *n256 = (radix_tree_node_256 *) node; + int i; + + for (i = node_iter->current_idx + 1; i < RADIX_TREE_NODE_MAX_SLOTS; i++) + { + if (node_256_is_slot_used(n256, i)) + break; + } + + if (i >= RADIX_TREE_NODE_MAX_SLOTS) + goto not_found; + + node_iter->current_idx = i; + slot = n256->slots[i]; + + /* Update the part of the key */ + if (NodeIsLeaf(node)) + radix_tree_iter_update_key(iter, node_iter->current_idx, 0); + + break; + } + } + + *found_p = true; + return slot; + +not_found: + *found_p = false; + return (Datum) 0; +} + +/* + * Initialize and update the node iteration struct with the given radix tree node. + * This function also updates the part of the key with the chunk of the given node. + */ +static void +radix_tree_store_iter_node(radix_tree_iter *iter, radix_tree_iter_node_data *node_iter, + radix_tree_node *node) +{ + node_iter->node = node; + node_iter->current_idx = -1; + + radix_tree_iter_update_key(iter, node->chunk, node->shift + RADIX_TREE_NODE_FANOUT); +} + +/* + * Build the stack of the radix tree node while descending to the leaf from the 'from' + * level. + */ +static void +radix_tree_update_iter_stack(radix_tree_iter *iter, int from) +{ + radix_tree_node *node = iter->stack[from].node; + int level = from; + + for (;;) + { + radix_tree_iter_node_data *node_iter = &(iter->stack[level--]); + bool found; + + /* Set the current node */ + radix_tree_store_iter_node(iter, node_iter, node); + + if (NodeIsLeaf(node)) + break; + + node = (radix_tree_node *) + DatumGetPointer(radix_tree_node_iterate_next(iter, node_iter, &found)); + + /* + * Since we always get the first slot in the node, we have to found + * the slot. + */ + Assert(found); + } +} + +uint64 +radix_tree_num_entries(radix_tree *tree) +{ + return tree->num_keys; +} + +uint64 +radix_tree_memory_usage(radix_tree *tree) +{ + return tree->mem_used; +} + +/***************** DEBUG FUNCTIONS *****************/ +#ifdef RADIX_TREE_DEBUG +void +radix_tree_stats(radix_tree *tree) +{ + fprintf(stderr, "num_keys = %lu, height = %u, n4 = %u(%lu), n32 = %u(%lu), n128 = %u(%lu), n256 = %u(%lu)", + tree->num_keys, + tree->root->shift / RADIX_TREE_NODE_FANOUT, + tree->cnt[0], tree->cnt[0] * sizeof(radix_tree_node_4), + tree->cnt[1], tree->cnt[1] * sizeof(radix_tree_node_32), + tree->cnt[2], tree->cnt[2] * sizeof(radix_tree_node_128), + tree->cnt[3], tree->cnt[3] * sizeof(radix_tree_node_256)); + //radix_tree_dump(tree); +} + +static void +radix_tree_print_slot(StringInfo buf, uint8 chunk, Datum slot, int idx, bool is_leaf, int level) +{ + char space[128] = {0}; + + if (level > 0) + sprintf(space, "%*c", level * 4, ' '); + + if (is_leaf) + appendStringInfo(buf, "%s[%d] \"0x%X\" val(0x%lX) LEAF\n", + space, + idx, + chunk, + DatumGetInt64(slot)); + else + appendStringInfo(buf , "%s[%d] \"0x%X\" -> ", + space, + idx, + chunk); +} + +static void +radix_tree_dump_node(radix_tree_node *node, int level, StringInfo buf, bool recurse) +{ + bool is_leaf = NodeIsLeaf(node); + + appendStringInfo(buf, "[\"%s\" type %d, cnt %u, shift %u, chunk \"0x%X\"] chunks:\n", + NodeIsLeaf(node) ? "LEAF" : "INNR", + (node->kind == RADIX_TREE_NODE_KIND_4) ? 4 : + (node->kind == RADIX_TREE_NODE_KIND_32) ? 32 : + (node->kind == RADIX_TREE_NODE_KIND_128) ? 128 : 256, + node->count, node->shift, node->chunk); + + switch (node->kind) + { + case RADIX_TREE_NODE_KIND_4: + { + radix_tree_node_4 *n4 = (radix_tree_node_4 *) node; + + for (int i = 0; i < n4->n.count; i++) + { + radix_tree_print_slot(buf, n4->chunks[i], n4->slots[i], i, is_leaf, level); + + if (!is_leaf) + { + if (recurse) + { + StringInfoData buf2; + + initStringInfo(&buf2); + radix_tree_dump_node((radix_tree_node *) n4->slots[i], level + 1, &buf2, recurse); + appendStringInfo(buf, "%s", buf2.data); + } + else + appendStringInfo(buf, "\n"); + } + } + break; + } + case RADIX_TREE_NODE_KIND_32: + { + radix_tree_node_32 *n32 = (radix_tree_node_32 *) node; + + for (int i = 0; i < n32->n.count; i++) + { + radix_tree_print_slot(buf, n32->chunks[i], n32->slots[i], i, is_leaf, level); + + if (!is_leaf) + { + if (recurse) + { + StringInfoData buf2; + + initStringInfo(&buf2); + radix_tree_dump_node((radix_tree_node *) n32->slots[i], level + 1, &buf2, recurse); + appendStringInfo(buf, "%s", buf2.data); + } + else + appendStringInfo(buf, "\n"); + } + } + break; + } + case RADIX_TREE_NODE_KIND_128: + { + radix_tree_node_128 *n128 = (radix_tree_node_128 *) node; + + for (int i = 0; i < RADIX_TREE_NODE_MAX_SLOTS; i++) + { + if (!node_128_is_slot_used(n128, i)) + continue; + + radix_tree_print_slot(buf, i, n128->slots[node_128_get_slot_pos(n128, i)], + i, is_leaf, level); + + if (!is_leaf) + { + if (recurse) + { + StringInfoData buf2; + + initStringInfo(&buf2); + radix_tree_dump_node((radix_tree_node *) n128->slots[node_128_get_slot_pos(n128, i)], + level + 1, &buf2, recurse); + appendStringInfo(buf, "%s", buf2.data); + } + else + appendStringInfo(buf, "\n"); + } + } + break; + } + case RADIX_TREE_NODE_KIND_256: + { + radix_tree_node_256 *n256 = (radix_tree_node_256 *) node; + + for (int i = 0; i < RADIX_TREE_NODE_MAX_SLOTS; i++) + { + if (!node_256_is_slot_used(n256, i)) + continue; + + radix_tree_print_slot(buf, i, n256->slots[i], i, is_leaf, level); + + if (!is_leaf) + { + if (recurse) + { + StringInfoData buf2; + + initStringInfo(&buf2); + radix_tree_dump_node((radix_tree_node *) n256->slots[i], level + 1, &buf2, recurse); + appendStringInfo(buf, "%s", buf2.data); + } + else + appendStringInfo(buf, "\n"); + } + } + break; + } + } +} + +void +radix_tree_dump_search(radix_tree *tree, uint64 key) +{ + StringInfoData buf; + radix_tree_node *node; + int shift; + int level = 0; + + elog(WARNING, "-----------------------------------------------------------"); + elog(WARNING, "max_val = %lu (0x%lX)", tree->max_val, tree->max_val); + + if (!tree->root) + { + elog(WARNING, "tree is empty"); + return; + } + + if (key > tree->max_val) + { + elog(WARNING, "key %lu (0x%lX) is larger than max val", + key, key); + return; + } + + initStringInfo(&buf); + node = tree->root; + shift = tree->root->shift; + while (shift >= 0) + { + radix_tree_node *child; + + radix_tree_dump_node(node, level, &buf, false); + + if (NodeIsLeaf(node)) + { + int chunk = GET_KEY_CHUNK(key, node->shift); + + /* We reached at a leaf node, find the corresponding slot */ + radix_tree_find_slot_ptr(node, chunk); + + break; + } + + child = radix_tree_find_child(node, key); + + if (child == NULL) + break; + + node = child; + shift -= RADIX_TREE_NODE_FANOUT; + level++; + } + + elog(WARNING, "\n%s", buf.data); +} + +void +radix_tree_dump(radix_tree *tree) +{ + StringInfoData buf; + + initStringInfo(&buf); + + elog(WARNING, "-----------------------------------------------------------"); + elog(WARNING, "max_val = %lu", tree->max_val); + radix_tree_dump_node(tree->root, 0, &buf, true); + elog(WARNING, "\n%s", buf.data); + elog(WARNING, "-----------------------------------------------------------"); +} +#endif diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h new file mode 100644 index 0000000000..fe5a4fd79a --- /dev/null +++ b/src/include/lib/radixtree.h @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * radixtree.h + * Interface for radix tree. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/include/lib/radixtree.h + * + *------------------------------------------------------------------------- + */ +#ifndef RADIXTREE_H +#define RADIXTREE_H + +#include "postgres.h" + +#define RADIX_TREE_DEBUG 1 + +typedef struct radix_tree radix_tree; +typedef struct radix_tree_iter radix_tree_iter; + +extern radix_tree *radix_tree_create(MemoryContext ctx); +extern Datum radix_tree_search(radix_tree *tree, uint64 key, bool *found); +extern void radix_tree_destroy(radix_tree *tree); +extern void radix_tree_insert(radix_tree *tree, uint64 key, Datum val, bool *found_p); +extern uint64 radix_tree_memory_usage(radix_tree *tree); +extern uint64 radix_tree_num_entries(radix_tree *tree); + +extern radix_tree_iter *radix_tree_begin_iterate(radix_tree *tree); +extern bool radix_tree_iterate_next(radix_tree_iter *iter, uint64 *key_p, Datum *value_p); +extern void radix_tree_end_iterate(radix_tree_iter *iter); + + +#ifdef RADIX_TREE_DEBUG +extern void radix_tree_dump(radix_tree *tree); +extern void radix_tree_dump_search(radix_tree *tree, uint64 key); +extern void radix_tree_stats(radix_tree *tree); +#endif + +#endif /* RADIXTREE_H */ diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 9090226daa..51b2514faf 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -24,6 +24,7 @@ SUBDIRS = \ test_parser \ test_pg_dump \ test_predtest \ + test_radixtree \ test_rbtree \ test_regex \ test_rls_hooks \ diff --git a/src/test/modules/test_radixtree/.gitignore b/src/test/modules/test_radixtree/.gitignore new file mode 100644 index 0000000000..5dcb3ff972 --- /dev/null +++ b/src/test/modules/test_radixtree/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_radixtree/Makefile b/src/test/modules/test_radixtree/Makefile new file mode 100644 index 0000000000..da06b93da3 --- /dev/null +++ b/src/test/modules/test_radixtree/Makefile @@ -0,0 +1,23 @@ +# src/test/modules/test_radixtree/Makefile + +MODULE_big = test_radixtree +OBJS = \ + $(WIN32RES) \ + test_radixtree.o +PGFILEDESC = "test_radixtree - test code for src/backend/lib/radixtree.c" + +EXTENSION = test_radixtree +DATA = test_radixtree--1.0.sql + +REGRESS = test_radixtree + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_radixtree +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_radixtree/README b/src/test/modules/test_radixtree/README new file mode 100644 index 0000000000..a8b271869a --- /dev/null +++ b/src/test/modules/test_radixtree/README @@ -0,0 +1,7 @@ +test_integerset contains unit tests for testing the integer set implementation +in src/backend/lib/integerset.c. + +The tests verify the correctness of the implementation, but they can also be +used as a micro-benchmark. If you set the 'intset_test_stats' flag in +test_integerset.c, the tests will print extra information about execution time +and memory usage. diff --git a/src/test/modules/test_radixtree/expected/test_radixtree.out b/src/test/modules/test_radixtree/expected/test_radixtree.out new file mode 100644 index 0000000000..0c96ebc739 --- /dev/null +++ b/src/test/modules/test_radixtree/expected/test_radixtree.out @@ -0,0 +1,20 @@ +CREATE EXTENSION test_radixtree; +-- +-- All the logic is in the test_radixtree() function. It will throw +-- an error if something fails. +-- +SELECT test_radixtree(); +NOTICE: testing radix tree with pattern "all ones" +NOTICE: testing radix tree with pattern "alternating bits" +NOTICE: testing radix tree with pattern "clusters of ten" +NOTICE: testing radix tree with pattern "clusters of hundred" +NOTICE: testing radix tree with pattern "one-every-64k" +NOTICE: testing radix tree with pattern "sparse" +NOTICE: testing radix tree with pattern "single values, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^32" +NOTICE: testing radix tree with pattern "clusters, distance > 2^60" + test_radixtree +---------------- + +(1 row) + diff --git a/src/test/modules/test_radixtree/sql/test_radixtree.sql b/src/test/modules/test_radixtree/sql/test_radixtree.sql new file mode 100644 index 0000000000..41ece5e9f5 --- /dev/null +++ b/src/test/modules/test_radixtree/sql/test_radixtree.sql @@ -0,0 +1,7 @@ +CREATE EXTENSION test_radixtree; + +-- +-- All the logic is in the test_radixtree() function. It will throw +-- an error if something fails. +-- +SELECT test_radixtree(); diff --git a/src/test/modules/test_radixtree/test_radixtree--1.0.sql b/src/test/modules/test_radixtree/test_radixtree--1.0.sql new file mode 100644 index 0000000000..074a5a7ea7 --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree--1.0.sql @@ -0,0 +1,8 @@ +/* src/test/modules/test_radixtree/test_radixtree--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_radixtree" to load this file. \quit + +CREATE FUNCTION test_radixtree() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; diff --git a/src/test/modules/test_radixtree/test_radixtree.c b/src/test/modules/test_radixtree/test_radixtree.c new file mode 100644 index 0000000000..e9fe7e0124 --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.c @@ -0,0 +1,397 @@ +/*-------------------------------------------------------------------------- + * + * test_radixtree.c + * Test radixtree set data structure. + * + * Copyright (c) 2022, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/test_radixtree/test_radixtree.c + * + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "common/pg_prng.h" +#include "fmgr.h" +#include "lib/radixtree.h" +#include "miscadmin.h" +#include "nodes/bitmapset.h" +#include "storage/block.h" +#include "storage/itemptr.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "X" + +/* + * If you enable this, the "pattern" tests will print information about + * how long populating, probing, and iterating the test set takes, and + * how much memory the test set consumed. That can be used as + * micro-benchmark of various operations and input patterns (you might + * want to increase the number of values used in each of the test, if + * you do that, to reduce noise). + * + * The information is printed to the server's stderr, mostly because + * that's where MemoryContextStats() output goes. + */ +static const bool intset_test_stats = true; + +static int radix_tree_node_max_entries[] = {4, 16, 48, 256}; + +/* + * A struct to define a pattern of integers, for use with the test_pattern() + * function. + */ +typedef struct +{ + char *test_name; /* short name of the test, for humans */ + char *pattern_str; /* a bit pattern */ + uint64 spacing; /* pattern repeats at this interval */ + uint64 num_values; /* number of integers to set in total */ +} test_spec; + +static const test_spec test_specs[] = { + { + "all ones", "1111111111", + 10, 1000000 + }, + { + "alternating bits", "0101010101", + 10, 1000000 + }, + { + "clusters of ten", "1111111111", + 10000, 1000000 + }, + { + "clusters of hundred", + "1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", + 10000, 10000000 + }, + { + "one-every-64k", "1", + 65536, 1000000 + }, + { + "sparse", "100000000000000000000000000000001", + 10000000, 1000000 + }, + { + "single values, distance > 2^32", "1", + UINT64CONST(10000000000), 100000 + }, + { + "clusters, distance > 2^32", "10101010", + UINT64CONST(10000000000), 1000000 + }, + { + "clusters, distance > 2^60", "10101010", + UINT64CONST(2000000000000000000), + 23 /* can't be much higher than this, or we + * overflow uint64 */ + } +}; + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(test_radixtree); + +static void test_empty(void); + +static void +test_empty(void) +{ + radix_tree *radixtree; + bool found; + + radixtree = radix_tree_create(CurrentMemoryContext); + + radix_tree_search(radixtree, 0, &found); + if (found) + elog(ERROR, "radix_tree_search on empty tree returned true"); + + radix_tree_search(radixtree, 1, &found); + if (found) + elog(ERROR, "radix_tree_search on empty tree returned true"); + + radix_tree_search(radixtree, PG_UINT64_MAX, &found); + if (found) + elog(ERROR, "radix_tree_search on empty tree returned true"); + + if (radix_tree_num_entries(radixtree) != 0) + elog(ERROR, "radix_tree_num_entries on empty tree return non-zero"); + + radix_tree_destroy(radixtree); +} + +static void +check_search_on_node(radix_tree *radixtree, uint8 shift, int start, int end) +{ + for (int i = start; i < end; i++) + { + uint64 key = ((uint64) i << shift); + bool found; + Datum val; + + val = radix_tree_search(radixtree, key, &found); + if (!found) + elog(ERROR, "key 0x" UINT64_HEX_FORMAT " is not found on node-%d", + key, end); + if (DatumGetUInt64(val) != key) + elog(ERROR, "radix_tree_search with key 0x" UINT64_HEX_FORMAT " returns 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT, + key, DatumGetUInt64(val), key); + } +} + +static void +test_node_types(uint8 shift) +{ + radix_tree *radixtree; + uint64 num_entries; + + radixtree = radix_tree_create(CurrentMemoryContext); + + for (int i = 0; i < 256; i++) + { + uint64 key = ((uint64) i << shift); + bool found; + + radix_tree_insert(radixtree, key, Int64GetDatum(key), &found); + + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", key); + + for (int j = 0; j < lengthof(radix_tree_node_max_entries); j++) + { + if (i == (radix_tree_node_max_entries[j] - 1)) + { + check_search_on_node(radixtree, shift, + (j == 0) ? 0 : radix_tree_node_max_entries[j - 1], + radix_tree_node_max_entries[j]); + break; + } + } + } + + num_entries = radix_tree_num_entries(radixtree); + + if (num_entries != 256) + elog(ERROR, + "radix_tree_num_entries returned" UINT64_FORMAT ", expected " UINT64_FORMAT, + num_entries, UINT64CONST(256)); +} + +/* + * Test with a repeating pattern, defined by the 'spec'. + */ +static void +test_pattern(const test_spec *spec) +{ + radix_tree *radixtree; + radix_tree_iter *iter; + MemoryContext radixtree_ctx; + TimestampTz starttime; + TimestampTz endtime; + uint64 n; + uint64 last_int; + int patternlen; + uint64 *pattern_values; + uint64 pattern_num_values; + + elog(NOTICE, "testing radix tree with pattern \"%s\"", spec->test_name); + if (intset_test_stats) + fprintf(stderr, "-----\ntesting radix tree with pattern \"%s\"\n", spec->test_name); + + /* Pre-process the pattern, creating an array of integers from it. */ + patternlen = strlen(spec->pattern_str); + pattern_values = palloc(patternlen * sizeof(uint64)); + pattern_num_values = 0; + for (int i = 0; i < patternlen; i++) + { + if (spec->pattern_str[i] == '1') + pattern_values[pattern_num_values++] = i; + } + + /* + * Allocate the integer set. + * + * Allocate it in a separate memory context, so that we can print its + * memory usage easily. (intset_create() creates a memory context of its + * own, too, but we don't have direct access to it, so we cannot call + * MemoryContextStats() on it directly). + */ + radixtree_ctx = AllocSetContextCreate(CurrentMemoryContext, + "radixtree test", + ALLOCSET_SMALL_SIZES); + MemoryContextSetIdentifier(radixtree_ctx, spec->test_name); + radixtree = radix_tree_create(radixtree_ctx); + + /* + * Add values to the set. + */ + starttime = GetCurrentTimestamp(); + + n = 0; + last_int = 0; + while (n < spec->num_values) + { + uint64 x = 0; + + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + bool found; + + x = last_int + pattern_values[i]; + + radix_tree_insert(radixtree, x, Int64GetDatum(x), &found); + + if (found) + elog(ERROR, "newly inserted key 0x" UINT64_HEX_FORMAT " found", x); + + n++; + } + last_int += spec->spacing; + } + + endtime = GetCurrentTimestamp(); + + if (intset_test_stats) + fprintf(stderr, "added " UINT64_FORMAT " values in %d ms\n", + spec->num_values, (int) (endtime - starttime) / 1000); + + /* + * Print stats on the amount of memory used. + * + * We print the usage reported by intset_memory_usage(), as well as the + * stats from the memory context. They should be in the same ballpark, + * but it's hard to automate testing that, so if you're making changes to + * the implementation, just observe that manually. + */ + if (intset_test_stats) + { + uint64 mem_usage; + + /* + * Also print memory usage as reported by intset_memory_usage(). It + * should be in the same ballpark as the usage reported by + * MemoryContextStats(). + */ + mem_usage = radix_tree_memory_usage(radixtree); + fprintf(stderr, "radix_tree_memory_usage() reported " UINT64_FORMAT " (%0.2f bytes / integer)\n", + mem_usage, (double) mem_usage / spec->num_values); + + MemoryContextStats(radixtree_ctx); + } + + /* Check that intset_get_num_entries works */ + n = radix_tree_num_entries(radixtree); + if (n != spec->num_values) + elog(ERROR, "radix_tree_num_entries returned " UINT64_FORMAT ", expected " UINT64_FORMAT, n, spec->num_values); + + /* + * Test random-access probes with intset_is_member() + */ + starttime = GetCurrentTimestamp(); + + for (n = 0; n < 100000; n++) + { + bool found; + bool expected; + uint64 x; + Datum v; + + /* + * Pick next value to probe at random. We limit the probes to the + * last integer that we added to the set, plus an arbitrary constant + * (1000). There's no point in probing the whole 0 - 2^64 range, if + * only a small part of the integer space is used. We would very + * rarely hit values that are actually in the set. + */ + x = pg_prng_uint64_range(&pg_global_prng_state, 0, last_int + 1000); + + /* Do we expect this value to be present in the set? */ + if (x >= last_int) + expected = false; + else + { + uint64 idx = x % spec->spacing; + + if (idx >= patternlen) + expected = false; + else if (spec->pattern_str[idx] == '1') + expected = true; + else + expected = false; + } + + /* Is it present according to intset_is_member() ? */ + v = radix_tree_search(radixtree, x, &found); + + if (found != expected) + elog(ERROR, "mismatch at 0x" UINT64_HEX_FORMAT ": %d vs %d", x, found, expected); + if (found && (DatumGetUInt64(v) != x)) + elog(ERROR, "found 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT, + DatumGetUInt64(v), x); + } + endtime = GetCurrentTimestamp(); + if (intset_test_stats) + fprintf(stderr, "probed " UINT64_FORMAT " values in %d ms\n", + n, (int) (endtime - starttime) / 1000); + + /* + * Test iterator + */ + starttime = GetCurrentTimestamp(); + + iter = radix_tree_begin_iterate(radixtree); + n = 0; + last_int = 0; + while (n < spec->num_values) + { + for (int i = 0; i < pattern_num_values && n < spec->num_values; i++) + { + uint64 expected = last_int + pattern_values[i]; + uint64 x; + uint64 val; + + if (!radix_tree_iterate_next(iter, &x, &val)) + break; + + if (x != expected) + elog(ERROR, + "iterate returned wrong key; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", x, expected, i); + if (DatumGetUInt64(val) != expected) + elog(ERROR, + "iterate returned wrong value; got 0x" UINT64_HEX_FORMAT ", expected 0x" UINT64_HEX_FORMAT " at %d", x, expected, i); + n++; + } + last_int += spec->spacing; + } + endtime = GetCurrentTimestamp(); + if (intset_test_stats) + fprintf(stderr, "iterated " UINT64_FORMAT " values in %d ms\n", + n, (int) (endtime - starttime) / 1000); + + if (n < spec->num_values) + elog(ERROR, "iterator stopped short after " UINT64_FORMAT " entries, expected " UINT64_FORMAT, n, spec->num_values); + if (n > spec->num_values) + elog(ERROR, "iterator returned " UINT64_FORMAT " entries, " UINT64_FORMAT " was expected", n, spec->num_values); + + MemoryContextDelete(radixtree_ctx); +} + +Datum +test_radixtree(PG_FUNCTION_ARGS) +{ + test_empty(); + + for (int shift = 0; shift <= (64 - 8); shift += 8) + test_node_types(shift); + + /* Test different test patterns, with lots of entries */ + for (int i = 0; i < lengthof(test_specs); i++) + test_pattern(&test_specs[i]); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_radixtree/test_radixtree.control b/src/test/modules/test_radixtree/test_radixtree.control new file mode 100644 index 0000000000..e53f2a3e0c --- /dev/null +++ b/src/test/modules/test_radixtree/test_radixtree.control @@ -0,0 +1,4 @@ +comment = 'Test code for radix tree' +default_version = '1.0' +module_pathname = '$libdir/test_radixtree' +relocatable = true