From 36800c5124b34e5ba105901de5ba9a0ed9c18d4b Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 10 Jan 2023 21:45:44 +0100 Subject: [PATCH v12 1/6] Implement dynamic prefix compression in nbtree Because tuples are ordered on the page, if some prefix of the scan attributes on both sides of the compared tuple are equal to the scankey, then the current tuple that is being compared must also have those prefixing attributes that equal the scankey. We cannot generally propagate this information to _binsrch on lower pages, as this downstream page may have concurrently split and/or have merged with its deleted left neighbour (see [0]), which moves the keyspace of the linked page. We thus can only trust the current state of this current page for this optimization, which means we must validate this state each time we open the page. Although this limits the overall applicability of the performance improvement, it still allows for a nice performance improvement in most cases where initial columns have many duplicate values and a compare function that is not cheap. As an exception to the above rule, most of the time a page's highkey is equal to the right separator on the parent page due to how btree splits are done. By storing this right separator from the parent page and then validating that the highkey of the child page contains the exact same data, we can restore the right prefix bound without having to call the relatively expensive _bt_compare. In the worst-case scenario of a concurrent page split, we'd still have to validate the full key, but that doesn't happen very often when compared to the number of times we descend the btree. 
--- contrib/amcheck/verify_nbtree.c | 17 +-- src/backend/access/nbtree/README | 43 ++++++++ src/backend/access/nbtree/nbtinsert.c | 34 ++++-- src/backend/access/nbtree/nbtsearch.c | 145 +++++++++++++++++++++++--- src/include/access/nbtree.h | 9 +- 5 files changed, 214 insertions(+), 34 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 94a9759322..e57625b75c 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2701,6 +2701,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) BTInsertStateData insertstate; OffsetNumber offnum; Page page; + AttrNumber cmpcol = 1; insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); @@ -2710,13 +2711,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.buf = lbuf; /* Get matching tuple on leaf page */ - offnum = _bt_binsrch_insert(state->rel, &insertstate); + offnum = _bt_binsrch_insert(state->rel, &insertstate, 1); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && insertstate.postingoff <= 0 && - _bt_compare(state->rel, key, page, offnum) == 0) + _bt_compare(state->rel, key, page, offnum, &cmpcol) == 0) exists = true; _bt_relbuf(state->rel, lbuf); } @@ -2778,6 +2779,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, { ItemId itemid; int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); @@ -2788,7 +2790,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, if (!key->heapkeyspace) return invariant_leq_offset(state, key, upperbound); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); /* * _bt_compare() is capable of determining that a scankey with a @@ -2840,10 +2842,11 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, 
OffsetNumber upperbound) { int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); return cmp <= 0; } @@ -2863,10 +2866,11 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound) { int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, lowerbound); + cmp = _bt_compare(state->rel, key, state->target, lowerbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) @@ -2901,13 +2905,14 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, { ItemId itemid; int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); /* Verify line pointer before checking tuple */ itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, upperbound); - cmp = _bt_compare(state->rel, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 52e646c7f7..0f10141a2f 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -901,6 +901,49 @@ large groups of duplicates, maximizing space utilization. Note also that deduplication more efficient. Deduplication can be performed infrequently, without merging together existing posting list tuples too often. +Notes about dynamic prefix truncation +------------------------------------- + +Because NBTrees have a sorted keyspace, when we have determined that some +prefixing columns of tuples on both sides of the tuple that is being +compared are equal to the scankey, then the current tuple must also share +this prefix with the scankey. 
This allows us to skip comparing those columns, +saving the indirect function calls in the compare operation. + +We can only use this constraint if we have proven this information while we +hold a pin on the page, so this is only useful on the page level: Concurrent +page deletions and splits may have moved the keyspace of the page referenced +by a parent page to the right. If we re-used high- and low-column-prefixes, +we would not be able to detect a change of keyspace from e.g. [2,3) to [1,2), +and subsequently return invalid results. This race condition can only be +prevented by re-establishing the prefix-equal-columns for each page. + +There is positive news, though: A page split will put a binary copy of the +page's highkey in the parent page. This means that we usually can reuse +the compare result of the parent page's downlink's right sibling when we +discover that their representation is binary-equal. In general this will +be the case, as only after concurrent page splits and deletes may the +downlink not point to the page with the correct highkey bound (_bt_moveright +only rarely actually moves right). + +To implement this, we copy the downlink's right differentiator key into a +temporary buffer, which is then compared against the child page's highkey. +If they match, we reuse the compare result (plus prefix) we had for it from +the parent page; if not, we need to do a full _bt_compare. Because memcpy + +memcmp is cheap compared to _bt_compare, and because it's quite unlikely +that we guess wrong, this speeds up our _bt_moveright code (at cost of some +stack memory in _bt_search and some overhead in case of a wrong prediction). + +Now that we have prefix bounds on the highest value of a page, the +_bt_binsrch procedure will use this result as a rightmost prefix compare, +and for each step in the binary search (that does not compare less than the +insert key) improve the equal-prefix bounds. 
+ +Using the above optimization, we now (on average) only need 2 full key +compares per page (plus ceil(log2(ntupsperpage)) single-attribute compares), +as opposed to the ceil(log2(ntupsperpage)) + 1 of a naive implementation; +a significant improvement. + Notes about deduplication ------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d33f814a93..39e7e9b731 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -328,6 +328,7 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) { Page page; BTPageOpaque opaque; + AttrNumber cmpcol = 1; _bt_checkpage(rel, insertstate->buf); page = BufferGetPage(insertstate->buf); @@ -346,7 +347,8 @@ _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) !P_IGNORE(opaque) && PageGetFreeSpace(page) > insertstate->itemsz && PageGetMaxOffsetNumber(page) >= P_HIKEY && - _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) + _bt_compare(rel, insertstate->itup_key, page, P_HIKEY, + &cmpcol) > 0) { /* * Caller can use the fastpath optimization because cached @@ -440,7 +442,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * in the fastpath below, but also in the _bt_findinsertloc() call later. */ Assert(!insertstate->bounds_valid); - offset = _bt_binsrch_insert(rel, insertstate); + offset = _bt_binsrch_insert(rel, insertstate, 1); /* * Scan over all equal tuples, looking for live conflicts. @@ -450,6 +452,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(itup_key->scantid == NULL); for (;;) { + AttrNumber cmpcol = 1; + /* * Each iteration of the loop processes one heap TID, not one index * tuple. 
Current offset number for page isn't usually advanced on @@ -485,7 +489,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(insertstate->bounds_valid); Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); Assert(insertstate->low <= insertstate->stricthigh); - Assert(_bt_compare(rel, itup_key, page, offset) < 0); + Assert(_bt_compare(rel, itup_key, page, offset, &cmpcol) < 0); break; } @@ -510,7 +514,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, if (!inposting) { /* Plain tuple, or first TID in posting list tuple */ - if (_bt_compare(rel, itup_key, page, offset) != 0) + if (_bt_compare(rel, itup_key, page, offset, &cmpcol) != 0) break; /* we're past all the equal tuples */ /* Advanced curitup */ @@ -720,11 +724,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else { int highkeycmp; + cmpcol = 1; /* If scankey == hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; - highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); + highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol); Assert(highkeycmp <= 0); if (highkeycmp != 0) break; @@ -867,6 +872,8 @@ _bt_findinsertloc(Relation rel, for (;;) { + AttrNumber cmpcol = 1; + /* * Does the new tuple belong on this page? 
* @@ -884,7 +891,7 @@ _bt_findinsertloc(Relation rel, /* Test '<=', not '!=', since scantid is set now */ if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0) break; _bt_stepright(rel, heapRel, insertstate, stack); @@ -937,6 +944,8 @@ _bt_findinsertloc(Relation rel, */ while (PageGetFreeSpace(page) < insertstate->itemsz) { + AttrNumber cmpcol = 1; + /* * Before considering moving right, see if we can obtain enough * space by erasing LP_DEAD items @@ -967,7 +976,7 @@ _bt_findinsertloc(Relation rel, break; if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) != 0 || pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) break; @@ -982,10 +991,13 @@ _bt_findinsertloc(Relation rel, * We should now be on the correct page. Find the offset within the page * for the new tuple. (Possibly reusing earlier search bounds.) */ - Assert(P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); + { + AttrNumber cmpcol PG_USED_FOR_ASSERTS_ONLY = 1; + Assert(P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0); + } - newitemoff = _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate, 1); if (insertstate->postingoff == -1) { @@ -1004,7 +1016,7 @@ _bt_findinsertloc(Relation rel, */ Assert(!insertstate->bounds_valid); insertstate->postingoff = 0; - newitemoff = _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate, 1); Assert(insertstate->postingoff == 0); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 3230b3b894..a6998e48d8 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,7 +26,8 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); -static OffsetNumber _bt_binsrch(Relation 
rel, BTScanInsert key, Buffer buf); +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf, + AttrNumber *highkeycmpcol); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, @@ -102,6 +103,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, { BTStack stack_in = NULL; int page_access = BT_READ; + char tupdatabuf[BLCKSZ / 3]; + AttrNumber highkeycmpcol = 1; /* heaprel must be set whenever _bt_allocbuf is reachable */ Assert(access == BT_READ || access == BT_WRITE); @@ -138,7 +141,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, * opportunity to finish splits of internal pages too. */ *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE), - stack_in, page_access, snapshot); + stack_in, page_access, snapshot, &highkeycmpcol, + (char *) tupdatabuf); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -150,12 +154,15 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, * Find the appropriate pivot tuple on this page. Its downlink points * to the child page that we're about to descend to. */ - offnum = _bt_binsrch(rel, key, *bufP); + offnum = _bt_binsrch(rel, key, *bufP, &highkeycmpcol); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); child = BTreeTupleGetDownLink(itup); + Assert(IndexTupleSize(itup) < sizeof(tupdatabuf)); + memcpy((char *) tupdatabuf, (char *) itup, IndexTupleSize(itup)); + /* * We need to save the location of the pivot tuple we chose in a new * stack entry for this page/level. 
If caller ends up splitting a @@ -189,6 +196,8 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, */ if (access == BT_WRITE && page_access == BT_READ) { + highkeycmpcol = 1; + /* trade in our read lock for a write lock */ _bt_unlockbuf(rel, *bufP); _bt_lockbuf(rel, *bufP, BT_WRITE); @@ -199,7 +208,7 @@ _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, * move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE, - snapshot); + snapshot, &highkeycmpcol, (char *) tupdatabuf); } return stack_in; @@ -248,13 +257,16 @@ _bt_moveright(Relation rel, bool forupdate, BTStack stack, int access, - Snapshot snapshot) + Snapshot snapshot, + AttrNumber *comparecol, + char *tupdatabuf) { Page page; BTPageOpaque opaque; int32 cmpval; Assert(!forupdate || heaprel != NULL); + Assert(PointerIsValid(comparecol) && PointerIsValid(tupdatabuf)); /* * When nextkey = false (normal case): if the scan key that brought us to @@ -277,12 +289,17 @@ _bt_moveright(Relation rel, for (;;) { + AttrNumber cmpcol = 1; + page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); if (P_RIGHTMOST(opaque)) + { + *comparecol = 1; break; + } /* * Finish any incomplete splits we encounter along the way. @@ -308,14 +325,55 @@ _bt_moveright(Relation rel, continue; } - if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) + /* + * tupdatabuf is filled with the right separator of the parent node. + * This allows us to do a binary equality check between the parent + * node's right separator (which is < key) and this page's P_HIKEY. + * If they are equal, we can reuse the result of the parent node's + * rightkey compare, which means we can potentially save a full key + * compare (which includes indirect calls to attribute comparison + * functions). 
+ * + * Without this, we'd on average use 3 full key compares per page before + * we achieve full dynamic prefix bounds, but with this optimization + * that is only 2. + * + * 3 compares: 1 for the highkey (rightmost), and on average 2 before + * we move right in the binary search on the page, this average equals + * SUM (1/2 ^ x) for x from 0 to log(n items)), which tends to 2. + */ + if (!P_IGNORE(opaque) && *comparecol > 1) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); + IndexTuple buftuple = (IndexTuple) tupdatabuf; + if (IndexTupleSize(itup) == IndexTupleSize(buftuple)) + { + char *dataptr = (char *) itup; + + if (memcmp(dataptr + sizeof(IndexTupleData), + tupdatabuf + sizeof(IndexTupleData), + IndexTupleSize(itup) - sizeof(IndexTupleData)) == 0) + break; + } else { + *comparecol = 1; + } + } else { + *comparecol = 1; + } + + if (P_IGNORE(opaque) || + _bt_compare(rel, key, page, P_HIKEY, &cmpcol) >= cmpval) { + *comparecol = 1; /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); continue; } else + { + *comparecol = cmpcol; break; + } } if (P_IGNORE(opaque)) @@ -341,6 +399,16 @@ _bt_moveright(Relation rel, * right place to descend to be sure we find all leaf keys >= given scankey * (or leaf keys > given scankey when nextkey is true). * + * When called, the "highkeycmpcol" pointer argument is expected to contain the + * AttrNumber of the first attribute that is not shared between scan key and + * this page's high key, i.e. the first attribute that we have to compare + * against the scan key. The value will be updated by _bt_binsrch to contain + * this same first column we'll need to compare against the scan key, but now + * for the index tuple at the returned offset. Valid values range from 1 + * (no shared prefix) to the number of key attributes + 1 (all index key + * attributes are equal to the scan key). See also _bt_compare, and + * backend/access/nbtree/README for more info. 
+ * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. @@ -348,7 +416,8 @@ _bt_moveright(Relation rel, static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, - Buffer buf) + Buffer buf, + AttrNumber *highkeycmpcol) { Page page; BTPageOpaque opaque; @@ -356,6 +425,13 @@ _bt_binsrch(Relation rel, high; int32 result, cmpval; + /* + * Prefix bounds, for the high/low offset's compare columns. + * "highkeycmpcol" is the value for this page's high key (if any) or 1 + * (no established shared prefix) + */ + AttrNumber highcmpcol = *highkeycmpcol, + lowcmpcol = 1; page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -388,6 +464,10 @@ _bt_binsrch(Relation rel, * For nextkey=true (cmpval=0), the loop invariant is: all slots before * 'low' are <= scan key, all slots at or after 'high' are > scan key. * + * We maintain highcmpcol and lowcmpcol to keep track of prefixes that + * tuples share with the scan key, potentially allowing us to skip a + * prefix in the midpoint comparison. + * * We can fall out when high == low. */ high++; /* establish the loop invariant for high */ @@ -397,17 +477,27 @@ _bt_binsrch(Relation rel, while (high > low) { OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* update prefix bounds */ /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, key, page, mid); + result = _bt_compare(rel, key, page, mid, &cmpcol); if (result >= cmpval) + { low = mid + 1; + lowcmpcol = cmpcol; + } else + { high = mid; + highcmpcol = cmpcol; + } } + /* update the bounds at the caller */ + *highkeycmpcol = highcmpcol; + /* * At this point we have high == low, but be careful: they could point * past the last slot on the page. @@ -450,7 +540,8 @@ _bt_binsrch(Relation rel, * list split). 
*/ OffsetNumber -_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +_bt_binsrch_insert(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol) { BTScanInsert key = insertstate->itup_key; Page page; @@ -460,6 +551,7 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) stricthigh; int32 result, cmpval; + AttrNumber lowcmpcol = 1; page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); @@ -510,16 +602,22 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) while (high > low) { OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, key, page, mid); + result = _bt_compare(rel, key, page, mid, &cmpcol); if (result >= cmpval) + { low = mid + 1; + lowcmpcol = cmpcol; + } else { high = mid; + highcmpcol = cmpcol; + if (result != 0) stricthigh = high; } @@ -654,6 +752,13 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) * matching TID in the posting tuple, which caller must handle * themselves (e.g., by splitting the posting list tuple). * + * NOTE: The "comparecol" argument must refer to the first attribute of the + * index tuple of which the caller knows that it does not match the scan key: + * this means 1 for "no known matching attributes", up to the number of key + * attributes + 1 if the caller knows that all key attributes of the index + * tuple match those of the scan key. See backend/access/nbtree/README for + * details. + * * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be * "minus infinity": this routine will always claim it is less than the * scankey. 
The actual key value stored is explicitly truncated to 0 @@ -667,7 +772,8 @@ int32 _bt_compare(Relation rel, BTScanInsert key, Page page, - OffsetNumber offnum) + OffsetNumber offnum, + AttrNumber *comparecol) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = BTPageGetOpaque(page); @@ -707,8 +813,9 @@ _bt_compare(Relation rel, ncmpkey = Min(ntupatts, key->keysz); Assert(key->heapkeyspace || ncmpkey == key->keysz); Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); - scankey = key->scankeys; - for (int i = 1; i <= ncmpkey; i++) + + scankey = key->scankeys + ((*comparecol) - 1); + for (int i = *comparecol; i <= ncmpkey; i++) { Datum datum; bool isNull; @@ -752,11 +859,20 @@ _bt_compare(Relation rel, /* if the keys are unequal, return the difference */ if (result != 0) + { + *comparecol = i; return result; + } scankey++; } + /* + * All tuple attributes are equal to the scan key, only later attributes + * could potentially not equal the scan key. + */ + *comparecol = ntupatts + 1; + /* * All non-truncated attributes (other than heap TID) were found to be * equal. 
Treat truncated attributes as minus infinity when scankey has a @@ -887,6 +1003,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat_total; BTScanPosItem *currItem; BlockNumber blkno; + AttrNumber cmpcol = 1; Assert(!BTScanPosIsValid(so->currPos)); @@ -1415,7 +1532,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, &inskey, buf); + offnum = _bt_binsrch(rel, &inskey, buf, &cmpcol); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8891fa7973..11f4184107 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1234,9 +1234,12 @@ extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot); extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, - int access, Snapshot snapshot); -extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); -extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); + int access, Snapshot snapshot, + AttrNumber *comparecol, char *tupdatabuf); +extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, + OffsetNumber offnum, AttrNumber *comparecol); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, -- 2.40.1