From a7403af67b54cd30fa11662dc5d3f7e15d4a1d2f Mon Sep 17 00:00:00 2001
From: Sergey Soloviev
Date: Wed, 3 Dec 2025 15:25:41 +0300
Subject: [PATCH v4 1/5] add in-memory T-tree tuple index

This patch implements a T-tree structure.  It will be used as the index
for a special type of grouping (index-based aggregation).  It supports
separate memory contexts for tracking memory allocations.  Just like
TupleHashTable, lookup takes an 'isnew' pointer that the caller can set
to NULL to prevent creation of new tuples (i.e. when the memory limit
is reached).  It also supports the key abbreviation optimization, as in
tuplesort.  Some of that code was copied and looks exactly the same, so
it is worth separating such logic into a common function later.

For now only the insert operation is supported, not delete, because no
delete operations occur during aggregation.
---
 src/backend/executor/execGrouping.c | 963 ++++++++++++++++++++++++++++
 src/include/executor/executor.h     |  65 ++
 src/include/nodes/execnodes.h       | 148 ++++-
 3 files changed, 1152 insertions(+), 24 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index c107514a85d..3155145a2a8 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -622,3 +622,966 @@ TupleHashTableMatch(struct tuplehash_hash *tb, MinimalTuple tuple1, MinimalTuple
 	econtext->ecxt_outertuple = slot1;
 	return !ExecQualAndReset(hashtable->cur_eq_func, econtext);
 }
+
+/*****************************************************************************
+ *		Utility routines for an all-in-memory T-tree
+ *
+ *		These routines build a T-tree index that groups tuples together
+ *		(e.g. for index aggregation).  There is one entry for each
+ *		not-distinct set of tuples presented.
+ *****************************************************************************/
+
+/*
+ * Representation of the entry searched for in the tuple index.  It has a
+ * separate representation to avoid the memory allocation that would be
+ * needed to create a MinimalTuple for a TupleIndexEntry.
+ */
+typedef struct TupleIndexSearchEntryData
+{
+	TupleTableSlot *slot;		/* search TupleTableSlot */
+	Datum		key1;			/* first searched key datum */
+	bool		isnull1;		/* first searched key is null */
+} TupleIndexSearchEntryData;
+
+typedef TupleIndexSearchEntryData *TupleIndexSearchEntry;
+
+/*
+ * compare_index_tuple_tiebreak
+ *		Perform full comparison of tuples without key abbreviation.
+ *
+ * Invoked when the first key (possibly abbreviated) cannot decide the
+ * comparison, so we have to compare all keys.
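+ *
+ * Note that equality of abbreviated keys does not imply equality of the
+ * original first-key values, so the first key is re-checked with the
+ * authoritative (full) comparator before moving on to the remaining keys.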
+ */
+static inline int
+compare_index_tuple_tiebreak(TupleIndex index, TupleIndexEntry left,
+							 TupleIndexSearchEntry right)
+{
+	HeapTupleData ltup;
+	SortSupport sortKey = index->sortKeys;
+	TupleDesc	tupDesc = index->tupDesc;
+	AttrNumber	attno;
+	Datum		datum1,
+				datum2;
+	bool		isnull1,
+				isnull2;
+	int			cmp;
+
+	ltup.t_len = left->tuple->t_len + MINIMAL_TUPLE_OFFSET;
+	ltup.t_data = (HeapTupleHeader) ((char *) left->tuple - MINIMAL_TUPLE_OFFSET);
+
+	if (sortKey->abbrev_converter)
+	{
+		attno = sortKey->ssup_attno;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+		datum2 = slot_getattr(right->slot, attno, &isnull2);
+
+		cmp = ApplySortAbbrevFullComparator(datum1, isnull1,
+											datum2, isnull2,
+											sortKey);
+		if (cmp != 0)
+			return cmp;
+	}
+
+	sortKey++;
+	for (int nkey = 1; nkey < index->nkeys; nkey++, sortKey++)
+	{
+		attno = sortKey->ssup_attno;
+
+		datum1 = heap_getattr(&ltup, attno, tupDesc, &isnull1);
+		datum2 = slot_getattr(right->slot, attno, &isnull2);
+
+		cmp = ApplySortComparator(datum1, isnull1,
+								  datum2, isnull2,
+								  sortKey);
+		if (cmp != 0)
+			return cmp;
+	}
+
+	return 0;
+}
+
+/*
+ * compare_index_tuple
+ *		Compare a pair of tuples during index lookup.
+ *
+ * The comparison honors key abbreviation.
+ */
+static int
+compare_index_tuple(TupleIndex index,
+					TupleIndexEntry left,
+					TupleIndexSearchEntry right)
+{
+	SortSupport sortKey = &index->sortKeys[0];
+	int			cmp;
+
+	cmp = ApplySortComparator(left->key1, left->isnull1,
+							  right->key1, right->isnull1,
+							  sortKey);
+	if (cmp != 0)
+		return cmp;
+
+	return compare_index_tuple_tiebreak(index, left, right);
+}
+
+/*
+ * tuple_index_node_bsearch
+ *		Perform binary search in an index node.
+ *
+ * On return, if 'found' is set to true, an exact match was found and the
+ * returned value is an index into the tuples array.  Otherwise the value is
+ * interpreted differently:
+ * - for internal nodes it tells which child to follow (0 means the left
+ *   child, ntuples means the right one)
+ * - for leaf nodes it is the position at which the new entry must be
+ *   inserted.
+ */
+static int
+tuple_index_node_bsearch(TupleIndex index, TupleIndexNode node,
+						 TupleIndexSearchEntry search, bool *found)
+{
+	int			low;
+	int			high;
+	int			cmp;
+
+	/*
+	 * During tree traversal the main task is to find the bounding node, so
+	 * quickly filtering out nodes known not to be bounding improves
+	 * performance.
+	 */
+	if (node->ntuples < 2)
+	{
+		if (node->ntuples == 1)
+		{
+			cmp = compare_index_tuple(index, node->tuples[0], search);
+			if (cmp == 0)
+			{
+				*found = true;
+				return 0;
+			}
+
+			*found = false;
+			return cmp < 0 ? 1 : 0;
+		}
+		else
+		{
+			/* can happen only when inserting the first entry into the index */
+			*found = false;
+			return 0;
+		}
+	}
+
+	/* minimum */
+	cmp = compare_index_tuple(index, node->tuples[0], search);
+	if (cmp == 0)
+	{
+		*found = true;
+		return 0;
+	}
+
+	if (cmp > 0)
+	{
+		*found = false;
+		return 0;
+	}
+
+	/* maximum */
+	cmp = compare_index_tuple(index, node->tuples[node->ntuples - 1], search);
+	if (cmp == 0)
+	{
+		*found = true;
+		return node->ntuples - 1;
+	}
+
+	if (cmp < 0)
+	{
+		*found = false;
+		return node->ntuples;
+	}
+
+	/* binary search in the middle */
+	low = 1;
+	high = node->ntuples - 1;
+	*found = false;
+
+	while (low < high)
+	{
+		int			mid = (low + high) / 2;
+		TupleIndexEntry mid_entry = node->tuples[mid];
+
+		cmp = compare_index_tuple(index, mid_entry, search);
+		if (cmp == 0)
+		{
+			*found = true;
+			return mid;
+		}
+
+		if (cmp < 0)
+			low = mid + 1;
+		else
+			high = mid;
+	}
+
+	return low;
+}
+
+static inline TupleIndexNode
+AllocIndexNode(TupleIndex index)
+{
+	TupleIndexNode node = (TupleIndexNode) MemoryContextAllocZero(index->nodecxt,
+																  sizeof(TupleIndexNodeData));
+
+	node->height = 1;			/* initial height */
+	return node;
+}
+
+static inline Datum
+mintup_getattr(MinimalTuple tup, TupleDesc tupdesc, AttrNumber attnum, bool *isnull)
+{
+	HeapTupleData htup;
+
+	htup.t_len = tup->t_len + MINIMAL_TUPLE_OFFSET;
+	htup.t_data = (HeapTupleHeader) ((char *) tup - MINIMAL_TUPLE_OFFSET);
+
+	return heap_getattr(&htup, attnum, tupdesc, isnull);
+}
+
+static inline bool
+index_node_is_leaf(TupleIndexNode node)
+{
+	return node->left == NULL && node->right == NULL;
+}
+
+static inline bool
+index_node_is_half_leaf(TupleIndexNode node)
+{
+	if (node->left == NULL)
+		return node->right != NULL;
+	return node->right == NULL;
+}
+
+static inline int
+index_node_height(TupleIndexNode node)
+{
+	if (node == NULL)
+		return 0;
+
+	return node->height;
+}
+
+static inline int
+index_node_calculate_height(TupleIndexNode node)
+{
+	return Max(index_node_height(node->left),
+			   index_node_height(node->right)) + 1;
+}
+
+static inline int
+index_node_get_balance(TupleIndexNode node)
+{
+	return index_node_height(node->left) - index_node_height(node->right);
+}
+
+/*
+ * tuple_index_node_check_special_rotation
+ *		When performing an LR/RL rotation, check for the corner case that
+ *		requires keeping the internal node full.
+ *
+ * The T-tree has the invariant that internal nodes must be completely full.
+ * But there is a corner case when the parent and its child are half-leaves
+ * and the bottom node is a (newly added) leaf: after performing the RL/LR
+ * rotation, the old bottom node (holding a single entry) would become an
+ * internal node with a single element, violating this invariant.
+ *
+ * To fix this we forcibly move entries from the middle node into the bottom
+ * one and then perform the required rotation.
+ *
+ * 'lr' tells which rotation we are about to perform; it is needed to
+ * transfer entries between the nodes correctly.
+ */
+static void
+tuple_index_node_check_special_rotation(TupleIndexNode parent,
+										TupleIndexNode middle,
+										TupleIndexNode bottom,
+										bool lr)
+{
+	/*
+	 * The paper describes the condition under which this case happens: two
+	 * half-leaves with a leaf at the bottom.  It also draws two conclusions:
+	 * the bottom node must hold a single element and the middle node must be
+	 * full.  We check only the root cause and merely Assert the conclusions,
+	 * to detect invalid usage or bugs.
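+	 *
+	 * For instance, in the LR case ('lr' == true) the shape is:
+	 *
+	 *		parent (half-leaf)
+	 *		/
+	 *	 middle (half-leaf, full)
+	 *		  \
+	 *	   bottom (leaf, 1 entry)
+	 *
+	 * and all of middle's entries except the first are slid into bottom
+	 * before the rotation, so bottom ends up full and middle keeps a single
+	 * entry.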
+	 */
+	if (!(index_node_is_half_leaf(parent) &&
+		  index_node_is_half_leaf(middle) &&
+		  index_node_is_leaf(bottom)))
+		return;
+
+	Assert(bottom->ntuples == 1);
+	Assert(middle->ntuples == TUPLE_INDEX_NODE_MAX_ENTRIES);
+
+	/* move tuples from the middle node to the bottom one */
+	if (lr)
+	{
+		bottom->tuples[TUPLE_INDEX_NODE_MAX_ENTRIES - 1] = bottom->tuples[0];
+		memmove(&bottom->tuples[0], &middle->tuples[1],
+				sizeof(TupleIndexEntry) * (TUPLE_INDEX_NODE_MAX_ENTRIES - 1));
+	}
+	else
+	{
+		memmove(&bottom->tuples[1], &middle->tuples[0],
+				sizeof(TupleIndexEntry) * (TUPLE_INDEX_NODE_MAX_ENTRIES - 1));
+		middle->tuples[0] = middle->tuples[TUPLE_INDEX_NODE_MAX_ENTRIES - 1];
+	}
+
+	bottom->ntuples = TUPLE_INDEX_NODE_MAX_ENTRIES;
+	middle->ntuples = 1;
+}
+
+/*
+ * tuple_index_node_rotate_left
+ *		Perform a left rotation (in T-tree naming: promote the left child),
+ *		returning the new subtree root.
+ */
+static TupleIndexNode
+tuple_index_node_rotate_left(TupleIndexNode p)
+{
+	TupleIndexNode l = p->left;
+	TupleIndexNode lr = l->right;
+
+	l->right = p;
+	p->left = lr;
+
+	p->height = index_node_calculate_height(p);
+	l->height = index_node_calculate_height(l);
+
+	return l;
+}
+
+/*
+ * tuple_index_node_rotate_right
+ *		Perform a right rotation (in T-tree naming: promote the right child),
+ *		returning the new subtree root.
+ */
+static TupleIndexNode
+tuple_index_node_rotate_right(TupleIndexNode p)
+{
+	TupleIndexNode r = p->right;
+	TupleIndexNode rl = r->left;
+
+	r->left = p;
+	p->right = rl;
+
+	p->height = index_node_calculate_height(p);
+	r->height = index_node_calculate_height(r);
+
+	return r;
+}
+
+/*
+ * tuple_index_insert_fixup
+ *		Check the balance of a T-tree index node and perform rotations if
+ *		needed.
+ *
+ * This function is invoked when a new node was created and the balance may
+ * have changed.  For this, the 'checkbalance' variable is passed through the
+ * call stack and must be checked after each recursive call returns.  The
+ * function detects imbalance and performs the required rotation.
+ *
+ * Note that according to the original paper, the T-tree assigns a different
+ * meaning to the letters in rotation names: the first letter denotes the
+ * side that caused the imbalance, while for an AVL tree it is the direction
+ * of the rotation, i.e. RR in a T-tree is LL in an AVL tree.
+ *
+ * Returns the root of the subtree, which can differ from the node passed in
+ * if a rotation was performed.  We do not store parent pointers, so this
+ * function must be called by the parent of the subtree.
+ */
+static TupleIndexNode
+tuple_index_insert_fixup(TupleIndex index, TupleIndexNode node)
+{
+	int			balance;
+
+	/*
+	 * Update the node's height: this function is invoked right after the
+	 * insertion of a new node, so the height may have changed.
+	 */
+	node->height = index_node_calculate_height(node);
+
+	balance = index_node_get_balance(node);
+
+	/* node is balanced */
+	if (-1 <= balance && balance <= 1)
+		return node;
+
+	if (balance < -1)
+	{
+		balance = index_node_get_balance(node->right);
+		if (balance > 0)
+		{
+			tuple_index_node_check_special_rotation(node, node->right,
+													node->right->left, false);
+			node->right = tuple_index_node_rotate_left(node->right);
+		}
+		node = tuple_index_node_rotate_right(node);
+	}
+	else						/* balance > 1 */
+	{
+		balance = index_node_get_balance(node->left);
+		if (balance < 0)
+		{
+			tuple_index_node_check_special_rotation(node, node->left,
+													node->left->right, true);
+			node->left = tuple_index_node_rotate_right(node->left);
+		}
+		node = tuple_index_node_rotate_left(node);
+	}
+
+	return node;
+}
+
+/*
+ * tuple_index_insert_greatest_lower_bound
+ *		Insert a new greatest lower bound into the given subtree.
+ *
+ * After removing a full node's old minimum, we must make it the new greatest
+ * lower bound of the left subtree.  This recursive function traverses to the
+ * rightmost node of the subtree and inserts the entry there, possibly
+ * creating a new node if the old one was full.
+ */
+static void
+tuple_index_insert_greatest_lower_bound(TupleIndex index, TupleIndexNode node,
+										TupleIndexEntry entry, int idx,
+										bool *checkbalance)
+{
+	if (node->right == NULL)
+	{
+		/* in-place insertion */
+		if (node->ntuples < TUPLE_INDEX_NODE_MAX_ENTRIES)
+		{
+			node->tuples[node->ntuples] = entry;
+			node->ntuples++;
+			return;
+		}
+
+		node->right = AllocIndexNode(index);
+		node->right->tuples[0] = entry;
+		node->right->ntuples = 1;
+		if (node->left == NULL)
+		{
+			node->height = 2;
+			*checkbalance = true;
+		}
+		/* otherwise the height of this node cannot change */
+	}
+	else
+	{
+		tuple_index_insert_greatest_lower_bound(index, node->right, entry, idx,
+												checkbalance);
+		if (*checkbalance)
+			node->right = tuple_index_insert_fixup(index, node->right);
+	}
+}
+
+static inline TupleIndexEntry
+tuple_index_create_entry(TupleIndex index, TupleIndexSearchEntry search)
+{
+	MemoryContext oldcxt;
+	TupleIndexEntry entry;
+
+	oldcxt = MemoryContextSwitchTo(index->tuplecxt);
+
+	entry = palloc(sizeof(TupleIndexEntryData));
+	entry->tuple = ExecCopySlotMinimalTupleExtra(search->slot, index->additionalsize);
+
+	MemoryContextSwitchTo(oldcxt);
+
+	/*
+	 * key1 of the search entry points into a TupleTableSlot, which has its
+	 * own lifetime, so we must not copy it; instead we recompute it from the
+	 * copied tuple.
+	 *
+	 * But if key abbreviation is in use, then we should copy it from the
+	 * search entry: that is safe (abbreviated keys are pass-by-value), and
+	 * recomputing the abbreviation could spoil its statistics.
+	 */
+	if (index->sortKeys->abbrev_converter)
+	{
+		entry->isnull1 = search->isnull1;
+		entry->key1 = search->key1;
+	}
+	else
+	{
+		SortSupport sortKey = &index->sortKeys[0];
+
+		entry->key1 = mintup_getattr(entry->tuple, index->tupDesc,
+									 sortKey->ssup_attno, &entry->isnull1);
+	}
+
+	return entry;
+}
+
+static TupleIndexEntry
+tuple_index_node_lookup(TupleIndex index, TupleIndexNode node,
+						TupleIndexSearchEntry search, bool *is_new,
+						bool *checkbalance)
+{
+	TupleIndexEntry entry;
+	int			idx;
+	bool		found;
+	bool		insert_here;
+	bool		have_space;
+	bool		is_bounding;
+
+	idx = tuple_index_node_bsearch(index, node, search, &found);
+	if (found)
+	{
+		if (is_new)
+			*is_new = false;
+		return node->tuples[idx];
+	}
+
+	insert_here = false;
+	is_bounding = 0 < idx && idx < node->ntuples;
+	have_space = node->ntuples < TUPLE_INDEX_NODE_MAX_ENTRIES;
+
+	if (is_bounding)
+	{
+		/* if this node is bounding, we must always insert the entry here */
+		insert_here = true;
+	}
+	else if (have_space &&
+			 ((idx == 0 && node->left == NULL) ||
+			  (idx == node->ntuples && node->right == NULL)))
+	{
+		/*
+		 * This node is not bounding, but if there is no suitable child and
+		 * we have enough space, then insert the entry into this node anyway,
+		 * since that saves some space.  In this case the value becomes the
+		 * new minimum/maximum.
+		 */
+		insert_here = true;
+	}
+
+	if (insert_here)
+	{
+		/* no equal entry found, but we are asked not to create new entries */
+		if (is_new == NULL)
+			return NULL;
+
+		entry = tuple_index_create_entry(index, search);
+
+		if (have_space)
+		{
+			/* we have space, so just insert into the sorted array */
+			Assert(node->ntuples < TUPLE_INDEX_NODE_MAX_ENTRIES);
+			Assert(0 <= idx && idx <= node->ntuples);
+
+			if (idx < node->ntuples)
+				memmove(&node->tuples[idx + 1], &node->tuples[idx],
+						sizeof(TupleIndexEntry) * (node->ntuples - idx));
+
+			node->tuples[idx] = entry;
+			node->ntuples++;
+		}
+		else
+		{
+			/*
+			 * If this node is bounding but has no free space, then we must
+			 * remove the current minimum entry and make it the new greatest
+			 * lower bound of the left subtree.
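+			 *
+			 * For example, with full node [a b c d] and a new entry x with
+			 * a < x < b, the node becomes [x b c d] and a is pushed down to
+			 * the rightmost position of the left subtree.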
+			 */
+			TupleIndexEntry oldmin;
+
+			Assert(node->ntuples == TUPLE_INDEX_NODE_MAX_ENTRIES);
+
+			oldmin = node->tuples[0];
+
+			/* insert the new entry into this node */
+			idx--;
+			if (0 < idx)
+				memmove(&node->tuples[0], &node->tuples[1],
+						sizeof(TupleIndexEntry) * idx);
+			node->tuples[idx] = entry;
+
+			/* make the old minimum the new greatest lower bound of the left subtree */
+			if (node->left == NULL)
+			{
+				node->left = AllocIndexNode(index);
+				node->left->tuples[0] = oldmin;
+				node->left->ntuples = 1;
+
+				/*
+				 * If right is NULL, this node was a leaf of height 1; with
+				 * the new left child its height becomes 2 and ancestors may
+				 * have to be rebalanced.
+				 */
+				if (node->right == NULL)
+				{
+					node->height = 2;
+					*checkbalance = true;
+				}
+			}
+			else
+			{
+				/* search for a suitable node and perform the insertion */
+				tuple_index_insert_greatest_lower_bound(index, node->left,
+														oldmin, idx,
+														checkbalance);
+				if (*checkbalance)
+					node->left = tuple_index_insert_fixup(index, node->left);
+			}
+		}
+
+		index->ntuples++;
+		*is_new = true;
+	}
+	else
+	{
+		/* non-bounding node - recurse into children */
+		TupleIndexNode *recurse_node;
+
+		Assert(idx == 0 || idx == node->ntuples);
+
+		if (idx == 0)
+			recurse_node = &node->left;
+		else
+			recurse_node = &node->right;
+
+		if (*recurse_node == NULL)
+		{
+			if (!is_new)
+				return NULL;
+
+			*recurse_node = AllocIndexNode(index);
+			node->height = index_node_calculate_height(node);
+			entry = tuple_index_create_entry(index, search);
+			(*recurse_node)->tuples[0] = entry;
+			(*recurse_node)->ntuples = 1;
+
+			index->ntuples++;
+			*is_new = true;
+			*checkbalance = true;
+		}
+		else
+		{
+			entry = tuple_index_node_lookup(index, *recurse_node, search,
+											is_new, checkbalance);
+
+			if (*checkbalance)
+				*recurse_node = tuple_index_insert_fixup(index, *recurse_node);
+		}
+	}
+
+	return entry;
+}
+
+static void
+remove_index_abbreviations_walker(TupleIndex index, TupleIndexNode node)
+{
+	for (int i = 0; i < node->ntuples; i++)
+	{
+		TupleIndexEntry entry = node->tuples[i];
+
+		entry->key1 = mintup_getattr(entry->tuple, index->tupDesc,
+									 index->sortKeys[0].ssup_attno,
+									 &entry->isnull1);
+	}
+
+	if (node->left)
+		remove_index_abbreviations_walker(index, node->left);
+
+	if (node->right)
+		remove_index_abbreviations_walker(index, node->right);
+}
+
+static void
+remove_index_abbreviations(TupleIndex index)
+{
+	SortSupport sortKey = &index->sortKeys[0];
+
+	sortKey->comparator = sortKey->abbrev_full_comparator;
+	sortKey->abbrev_converter = NULL;
+	sortKey->abbrev_abort = NULL;
+	sortKey->abbrev_full_comparator = NULL;
+
+	remove_index_abbreviations_walker(index, index->root);
+}
+
+static inline void
+prepare_search_index_tuple(TupleIndex index, TupleTableSlot *slot,
+						   TupleIndexSearchEntry entry)
+{
+	SortSupport sortKey;
+
+	sortKey = &index->sortKeys[0];
+
+	entry->slot = slot;
+	entry->key1 = slot_getattr(slot, sortKey->ssup_attno, &entry->isnull1);
+
+	/* NULL cannot be abbreviated */
+	if (entry->isnull1)
+		return;
+
+	/* abbreviation is not in use */
+	if (!sortKey->abbrev_converter)
+		return;
+
+	/* check whether abbreviation should be abandoned */
+	if (index->abbrevNext <= index->ntuples)
+	{
+		index->abbrevNext *= 2;
+
+		if (sortKey->abbrev_abort(index->ntuples, sortKey))
+		{
+			remove_index_abbreviations(index);
+			return;
+		}
+	}
+
+	entry->key1 = sortKey->abbrev_converter(entry->key1, sortKey);
+}
+
+TupleIndexEntry
+TupleIndexLookup(TupleIndex index, TupleTableSlot *searchslot, bool *is_new)
+{
+	TupleIndexEntry entry;
+	TupleIndexSearchEntryData search_entry;
+	bool		checkbalance = false;
+
+	prepare_search_index_tuple(index, searchslot, &search_entry);
+
+	entry = tuple_index_node_lookup(index, index->root, &search_entry, is_new,
+									&checkbalance);
+
+	if (entry == NULL)
+		return NULL;
+
+	if (checkbalance)
+		index->root = tuple_index_insert_fixup(index, index->root);
+
+	return entry;
+}
+
+void
+InitTupleIndexIterator(TupleIndex index, TupleIndexIterator iter)
+{
+	TupleIndexNode min_node;
+
+	/* iterate to the leftmost node */
+	min_node = index->root;
+
+	/*
+	 * An in-order traversal requires us to keep track of the nodes on our
+	 * path, so that we can come back and process them.  The T-tree has a
+	 * nice property for such a traversal: all nodes to the left are already
+	 * visited and all nodes to the right are not visited yet.  So to get to
+	 * the next node we either go to the leftmost node of the right subtree
+	 * (recording nodes on the way) or traverse bottom-up to the first
+	 * unvisited node.
+	 *
+	 * To keep track of parent nodes we use a separate height-indexed array.
+	 * For each height there can be at most one node on the current path, so
+	 * to get to our parent we just increase the height.  There is one
+	 * problem: like an AVL tree, the T-tree allows slight imbalance, so
+	 * there may be no path node at height + 1.  We call this a height gap
+	 * and handle it by setting the 'visited' flag, so such entries are
+	 * skipped; the array is consulted only during bottom-up traversal.
+	 */
+	iter->max_height = index->root->height;
+	iter->stack = palloc0(sizeof(TupleIndexIteratorNode) * iter->max_height);
+
+	while (min_node->left != NULL)
+	{
+		TupleIndexIteratorNode *n = &iter->stack[min_node->height - 1];
+
+		n->node = min_node;
+		n->visited = false;
+
+		/* height-gap */
+		if (min_node->height != min_node->left->height + 1)
+			iter->stack[min_node->left->height].visited = true;
+		min_node = min_node->left;
+	}
+
+	iter->cur_node = min_node;
+	iter->cur_idx = 0;
+}
+
+static TupleIndexNode
+tuple_index_iterator_move_next(TupleIndexIterator iter)
+{
+	TupleIndexNode node = iter->cur_node;
+
+	if (node->right)
+	{
+		TupleIndexNode left;
+
+		/* we have a right subtree that is not visited yet */
+
+		/* mark the current node as already visited */
+		iter->stack[node->height - 1].visited = true;
+
+		/* height-gap */
+		if (node->height != node->right->height + 1)
+			iter->stack[node->right->height].visited = true;
+
+		/*
+		 * Iterate to the leftmost node of this subtree and mark every node
+		 * on the way as not visited, so we will traverse them later.
+		 */
+		left = node->right;
+		while (left->left != NULL)
+		{
+			TupleIndexIteratorNode *n = &iter->stack[left->height - 1];
+
+			n->visited = false;
+			n->node = left;
+
+			/* height-gap */
+			if (left->height != left->left->height + 1)
+				iter->stack[left->left->height].visited = true;
+			left = left->left;
+		}
+
+		iter->cur_idx = 0;
+		iter->cur_node = left;
+		return left;
+	}
+	else
+	{
+		int			height = node->height + 1;
+
+		/* traverse the stack upwards and find the first unvisited node */
+
+		/* skip already visited nodes */
+		while (height <= iter->max_height && iter->stack[height - 1].visited)
+			height++;
+
+		if (iter->max_height < height)
+			node = NULL;
+		else
+			node = iter->stack[height - 1].node;
+
+		iter->cur_node = node;
+		iter->cur_idx = 0;
+		return node;
+	}
+}
+
+TupleIndexEntry
+TupleIndexIteratorNext(TupleIndexIterator iter)
+{
+	TupleIndexNode node = iter->cur_node;
+	TupleIndexEntry tuple;
+
+	if (node == NULL)
+		return NULL;
+
+	/* this also handles the case of a single empty root node */
+	if (node->ntuples <= iter->cur_idx)
+	{
+		node = tuple_index_iterator_move_next(iter);
+		if (node == NULL)
+			return NULL;
+	}
+
+	tuple = node->tuples[iter->cur_idx];
+	iter->cur_idx++;
+	return tuple;
+}
+
+/*
+ * Construct an empty TupleIndex.
+ *
+ * inputDesc: tuple descriptor for the input tuples
+ * nkeys: number of columns to be compared (length of the next four arrays)
+ * attNums: attribute numbers of the grouping columns, in sort order
+ * sortOperators: Oids of the ordering operators used for comparisons
+ * sortCollations: collations used for comparisons
+ * nullsFirstFlags: NULLS FIRST/LAST flags for each column
+ * additionalsize: size of data that may be stored along with an index entry;
+ *		used for storing per-trans information during aggregation
+ * metacxt: memory context for the TupleIndex itself
+ * tuplecxt: memory context for storing MinimalTuples
+ * nodecxt: memory context for storing index nodes
+ */
+TupleIndex
+BuildTupleIndex(TupleDesc inputDesc,
+				int nkeys,
+				AttrNumber *attNums,
+				Oid *sortOperators,
+				Oid *sortCollations,
+				bool *nullsFirstFlags,
+				Size additionalsize,
+				MemoryContext metacxt,
+				MemoryContext tuplecxt,
+				MemoryContext nodecxt)
+{
+	TupleIndex	index;
+	MemoryContext oldcxt;
+
+	Assert(nkeys > 0);
+
+	additionalsize = MAXALIGN(additionalsize);
+
+	oldcxt = MemoryContextSwitchTo(metacxt);
+
+	index = (TupleIndex) palloc(sizeof(TupleIndexData));
+	index->tuplecxt = tuplecxt;
+	index->nodecxt = nodecxt;
+	index->additionalsize = additionalsize;
+	index->tupDesc = CreateTupleDescCopy(inputDesc);
+	index->root = AllocIndexNode(index);
+	index->ntuples = 0;
+	index->height = 0;
+
+	index->nkeys = nkeys;
+	index->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData));
+
+	for (int i = 0; i < nkeys; ++i)
+	{
+		SortSupport sortKey = &index->sortKeys[i];
+
+		Assert(AttributeNumberIsValid(attNums[i]));
+		Assert(OidIsValid(sortOperators[i]));
+
+		sortKey->ssup_cxt = CurrentMemoryContext;
+		sortKey->ssup_collation = sortCollations[i];
+		sortKey->ssup_nulls_first = nullsFirstFlags[i];
+		sortKey->ssup_attno = attNums[i];
+		/* abbreviation applies only to the first key */
+		sortKey->abbreviate = (i == 0);
+
+		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
+	}
+
+	/* update abbreviation information */
+	if (index->sortKeys[0].abbrev_converter != NULL)
+	{
+		index->abbrevUsed = true;
+		index->abbrevNext = 10;
+		index->abbrevSortOp = sortOperators[0];
+	}
+	else
+		index->abbrevUsed = false;
+
+	MemoryContextSwitchTo(oldcxt);
+	return index;
+}
+
+/*
+ * Reset the contents of the index to empty, preserving all the non-content
+ * state.
+ */
+void
+ResetTupleIndex(TupleIndex index)
+{
+	SortSupport ssup;
+
+	/* by this time the tuple and node contexts must have been reset by the caller */
+	index->root = AllocIndexNode(index);
+	index->height = 0;
+	index->ntuples = 0;
+
+	if (!index->abbrevUsed)
+		return;
+
+	/*
+	 * If key abbreviation was used, then we must reset its state.  All
+	 * fields of the SortSupport are already set up, but we clear some of
+	 * them so that it looks just as if it were being set up for the first
+	 * time.
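+	 * (Clearing ssup->comparator first lets PrepareSortSupportFromOrderingOp
+	 * rebuild the comparison state from scratch rather than reuse whatever
+	 * an abandoned abbreviation left behind.)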
+	 */
+	ssup = &index->sortKeys[0];
+	ssup->comparator = NULL;
+	PrepareSortSupportFromOrderingOp(index->abbrevSortOp, ssup);
+}
+
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 5929aabc353..c923ca6d8a9 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -198,6 +198,71 @@ TupleHashEntryGetAdditional(TupleHashTable hashtable, TupleHashEntry entry)
 }
 #endif
 
+extern TupleIndex BuildTupleIndex(TupleDesc inputDesc,
+								  int nkeys,
+								  AttrNumber *attNums,
+								  Oid *sortOperators,
+								  Oid *sortCollations,
+								  bool *nullsFirstFlags,
+								  Size additionalsize,
+								  MemoryContext metacxt,
+								  MemoryContext tuplecxt,
+								  MemoryContext nodecxt);
+extern TupleIndexEntry TupleIndexLookup(TupleIndex index, TupleTableSlot *search,
+										bool *is_new);
+extern void ResetTupleIndex(TupleIndex index);
+
+/*
+ * Start iteration over the tuples in the index.  Only the ascending
+ * direction is supported.  No modifications are allowed while iterating,
+ * since they would break the iterator.
+ */
+extern void InitTupleIndexIterator(TupleIndex index, TupleIndexIterator iter);
+extern TupleIndexEntry TupleIndexIteratorNext(TupleIndexIterator iter);
+static inline void
+ResetTupleIndexIterator(TupleIndex index, TupleIndexIterator iter)
+{
+	InitTupleIndexIterator(index, iter);
+}
+
+#ifndef FRONTEND
+
+/*
+ * Return the size of an index entry.  Useful for estimating memory usage.
+ */
+static inline size_t
+TupleIndexEntrySize(void)
+{
+	return sizeof(TupleIndexEntryData);
+}
+
+/*
+ * Get a pointer to the additional space allocated for this entry.  The
+ * memory will be maxaligned and zeroed.
+ *
+ * The amount of space available is the additionalsize requested in the call
+ * to BuildTupleIndex().  If additionalsize was specified as zero, return
+ * NULL.
+ */
+static inline void *
+TupleIndexEntryGetAdditional(TupleIndex index, TupleIndexEntry entry)
+{
+	if (index->additionalsize > 0)
+		return (char *) (entry->tuple) - index->additionalsize;
+	else
+		return NULL;
+}
+
+/*
+ * Return the MinimalTuple stored in an index entry.
+ */
+static inline MinimalTuple
+TupleIndexEntryGetMinimalTuple(TupleIndexEntry entry)
+{
+	return entry->tuple;
+}
+
+#endif
+
 /*
  * prototypes from functions in execJunk.c
  */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 02265456978..c45352a7dc1 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -900,7 +900,94 @@ typedef tuplehash_iterator TupleHashIterator;
 #define ScanTupleHashTable(htable, iter) \
 	tuplehash_iterate(htable->hashtab, iter)
 
+/* ---------------------------------------------------------------
+ *				 Tuple T-tree index
+ *
+ * All-in-memory tuple T-tree index used for grouping and aggregating.
+ * ---------------------------------------------------------------
+ */
+
+/*
+ * Representation of a tuple in the index.  It stores both the tuple and the
+ * first key.  If key abbreviation is used, key1 holds the abbreviated key.
+ */
+typedef struct TupleIndexEntryData
+{
+	MinimalTuple tuple;			/* actual stored tuple */
+	Datum		key1;			/* value of first key */
+	bool		isnull1;		/* first key is null */
+} TupleIndexEntryData;
+
+typedef TupleIndexEntryData *TupleIndexEntry;
+
+/*
+ * T-tree node of the tuple index.  Common for both internal and leaf nodes.
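+ *
+ * A node with no children is a leaf, a node with exactly one child is a
+ * half-leaf, and a node with two children is internal.  Internal nodes are
+ * kept completely full; see tuple_index_node_check_special_rotation().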
+ */
+typedef struct TupleIndexNodeData
+{
+	/* left child, with keys less than the minimum */
+	struct TupleIndexNodeData *left;
+	/* right child, with keys greater than the maximum */
+	struct TupleIndexNodeData *right;
+	/* number of tuples in the node */
+	int			ntuples;
+	/* height of the node */
+	int			height;
+
+/*
+ * Maximum number of tuples stored in a tuple index node.  125 is chosen so
+ * that on 64-bit platforms the node occupies exactly 1024 bytes, i.e. a
+ * whole number of cache lines.
+ */
+#define TUPLE_INDEX_NODE_MAX_ENTRIES 125
+	/*
+	 * Array of tuple entries of this node, kept in sort order.  Unlike a
+	 * B-tree, internal nodes store actual tuples here, not just separator
+	 * keys.
+	 */
+	TupleIndexEntry tuples[TUPLE_INDEX_NODE_MAX_ENTRIES];
+} TupleIndexNodeData;
+
+typedef TupleIndexNodeData *TupleIndexNode;
+
+typedef struct TupleIndexData
+{
+	TupleDesc	tupDesc;		/* descriptor for stored tuples */
+	TupleIndexNode root;		/* root of the tree */
+	int			height;			/* current tree height */
+	int			ntuples;		/* number of tuples in the index */
+	int			nkeys;			/* number of sort keys in a tuple */
+	SortSupport sortKeys;		/* support functions for key comparison */
+	MemoryContext tuplecxt;		/* memory context containing tuples */
+	MemoryContext nodecxt;		/* memory context containing index nodes */
+	Size		additionalsize; /* size of additional data for a tuple */
+	int			abbrevNext;		/* next time we should check abbreviation
+								 * optimization efficiency */
+	bool		abbrevUsed;		/* true if the key abbreviation optimization
+								 * was ever used */
+	Oid			abbrevSortOp;	/* sort operator for the first key */
+} TupleIndexData;
+
+typedef struct TupleIndexData *TupleIndex;
+
+typedef struct TupleIndexIteratorNode
+{
+	TupleIndexNode node;		/* index node itself */
+	bool		visited;		/* was this node visited yet? */
+} TupleIndexIteratorNode;
+
+typedef struct TupleIndexIteratorData
+{
+	TupleIndexIteratorNode *stack;	/* stack of traversed nodes */
+	int			max_height;		/* max height of the tree (root height) */
+	TupleIndexNode cur_node;	/* node we are currently iterating over */
+	OffsetNumber cur_idx;		/* index of the tuple in cur_node to return
+								 * next */
+} TupleIndexIteratorData;
+
+typedef TupleIndexIteratorData *TupleIndexIterator;
+
 /* ----------------------------------------------------------------
  *				 Expression State Nodes
  *
@@ -2529,6 +2616,7 @@ typedef struct AggStatePerTransData *AggStatePerTrans;
 typedef struct AggStatePerGroupData *AggStatePerGroup;
 typedef struct AggStatePerPhaseData *AggStatePerPhase;
 typedef struct AggStatePerHashData *AggStatePerHash;
+typedef struct AggStatePerIndexData *AggStatePerIndex;
 
 typedef struct AggState
 {
@@ -2544,17 +2632,18 @@ typedef struct AggState
 	AggStatePerAgg peragg;		/* per-Aggref information */
 	AggStatePerTrans pertrans;	/* per-Trans state information */
 	ExprContext *hashcontext;	/* econtexts for long-lived data (hashtable) */
+	ExprContext *indexcontext;	/* econtexts for long-lived data (index) */
 	ExprContext **aggcontexts;	/* econtexts for long-lived data (per GS) */
 	ExprContext *tmpcontext;	/* econtext for input expressions */
-#define FIELDNO_AGGSTATE_CURAGGCONTEXT 14
+#define FIELDNO_AGGSTATE_CURAGGCONTEXT 15
 	ExprContext *curaggcontext; /* currently active aggcontext */
 	AggStatePerAgg curperagg;	/* currently active aggregate, if any */
-#define FIELDNO_AGGSTATE_CURPERTRANS 16
+#define FIELDNO_AGGSTATE_CURPERTRANS 17
 	AggStatePerTrans curpertrans;	/* currently active trans state, if any */
 	bool		input_done;		/* indicates end of input */
 	bool		agg_done;		/* indicates completion of Agg scan */
 	int			projected_set;	/* The last projected grouping set */
-#define FIELDNO_AGGSTATE_CURRENT_SET 20
+#define FIELDNO_AGGSTATE_CURRENT_SET 21
 	int			current_set;	/* The current grouping set being evaluated */
 	Bitmapset  *grouped_cols;	/* grouped cols in current projection */
 	List	   *all_grouped_cols;	/* list of all grouped cols in DESC order */
@@ -2576,32 +2665,43 @@ typedef struct AggState
 	int			num_hashes;
 	MemoryContext hash_metacxt; /* memory for hash table bucket array */
 	MemoryContext hash_tuplescxt;	/* memory for hash table tuples */
-	struct LogicalTapeSet *hash_tapeset;	/* tape set for hash spill tapes */
-	struct HashAggSpill *hash_spills;	/* HashAggSpill for each grouping set,
-										 * exists only during first pass */
-	TupleTableSlot *hash_spill_rslot;	/* for reading spill files */
-	TupleTableSlot *hash_spill_wslot;	/* for writing spill files */
-	List	   *hash_batches;	/* hash batches remaining to be processed */
-	bool		hash_ever_spilled;	/* ever spilled during this execution? */
-	bool		hash_spill_mode;	/* we hit a limit during the current batch
-									 * and we must not create new groups */
-	Size		hash_mem_limit; /* limit before spilling hash table */
-	uint64		hash_ngroups_limit; /* limit before spilling hash table */
-	int			hash_planned_partitions;	/* number of partitions planned
-											 * for first pass */
-	double		hashentrysize;	/* estimate revised during execution */
-	Size		hash_mem_peak;	/* peak hash table memory usage */
-	uint64		hash_ngroups_current;	/* number of groups currently in
-										 * memory in all hash tables */
-	uint64		hash_disk_used; /* kB of disk space used */
-	int			hash_batches_used;	/* batches used during entire execution */
-
 	AggStatePerHash perhash;	/* array of per-hashtable data */
 	AggStatePerGroup *hash_pergroup;	/* grouping set indexed array of
 										 * per-group pointers */
 
+	/* Fields used for managing spill mode in hash and index aggs */
+	struct LogicalTapeSet *spill_tapeset;	/* tape set for hash spill tapes */
+	struct HashAggSpill *spills;	/* HashAggSpill for each grouping set,
+									 * exists only during first pass */
+	TupleTableSlot *spill_rslot;	/* for reading spill files */
+	TupleTableSlot *spill_wslot;	/* for writing spill files */
+	List	   *spill_batches;	/* hash batches remaining to be processed */
+
+	bool		spill_ever_happened;	/* ever spilled during this execution? */
+	bool		spill_mode;		/* we hit a limit during the current batch
+								 * and we must not create new groups */
+	Size		spill_mem_limit;	/* limit before spilling hash table or index */
+	uint64		spill_ngroups_limit;	/* limit before spilling hash table or index */
+	int			spill_planned_partitions;	/* number of partitions planned
+											 * for first pass */
+	double		hashentrysize;	/* estimate revised during execution */
+	Size		spill_mem_peak; /* peak memory usage of hash table or index */
+	uint64		spill_ngroups_current;	/* number of groups currently in
+										 * memory in all hash tables or the
+										 * index */
+	uint64		spill_disk_used;	/* kB of disk space used */
+	int			spill_batches_used; /* batches used during entire execution */
+
+	/* these fields are used in AGG_INDEXED mode: */
+	AggStatePerIndex perindex;	/* pointer to per-index state data */
+	bool		index_filled;	/* index filled yet? */
+	MemoryContext index_metacxt;	/* memory for index structure */
+	MemoryContext index_nodecxt;	/* memory for index nodes */
+	MemoryContext index_entrycxt;	/* memory for index entries */
+	Sort	   *index_sort;		/* ordering information for index */
+	Tuplesortstate *mergestate; /* state for merging projected tuples if a
+								 * spill occurred */
 
 	/* support for evaluation of agg input expressions: */
-#define FIELDNO_AGGSTATE_ALL_PERGROUPS 54
+#define FIELDNO_AGGSTATE_ALL_PERGROUPS 62
 	AggStatePerGroup *all_pergroups;	/* array of first ->pergroups, than
 										 * ->hash_pergroup */
 	SharedAggInfo *shared_info; /* one entry per worker */
-- 
2.43.0
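
Illustrative usage sketch (not part of the patch).  A minimal caller of the
new API, assuming a single grouping column at attribute number 1 whose
ordering operator "opLt" and collation "coll" have already been looked up,
and with "inputDesc" and "slot" (the current input tuple) supplied by the
surrounding executor code; all of those names are hypothetical:

	AttrNumber	attNums[1] = {1};
	Oid			sortOps[1] = {opLt};
	Oid			colls[1] = {coll};
	bool		nullsFirst[1] = {false};
	TupleIndex	index;
	TupleIndexIteratorData iter;
	TupleIndexEntry entry;
	bool		isnew;

	index = BuildTupleIndex(inputDesc, 1, attNums, sortOps, colls, nullsFirst,
							0,	/* no additional per-entry space */
							CurrentMemoryContext,	/* metacxt */
							CurrentMemoryContext,	/* tuplecxt */
							CurrentMemoryContext);	/* nodecxt */

	/* find the group for the tuple in "slot", creating it if necessary */
	entry = TupleIndexLookup(index, slot, &isnew);

	/*
	 * Passing NULL instead of &isnew makes the lookup return NULL rather
	 * than create a new group, e.g. once a memory limit has been reached.
	 */

	/* read the groups back in ascending key order */
	InitTupleIndexIterator(index, &iter);
	while ((entry = TupleIndexIteratorNext(&iter)) != NULL)
	{
		MinimalTuple tup = TupleIndexEntryGetMinimalTuple(entry);

		/* ... process one group ... */
	}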