From be69308e5ba185ccde4b9a395081e48f7c97ee53 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 22 Feb 2021 14:13:13 +0100 Subject: [PATCH v2 2/2] Implement page-level dynamic prefix truncation for _bt_binsrch* Because tuples are ordered on the page, if some prefix of the scan columns of the tuples on both sides of the compared tuple is equal to the scankey, then the tuple that is being compared must also have those prefix columns equal to the scankey. We cannot propagate this information to _bt_binsrch* on subsequent pages, as a downstream page may concurrently have split and/or have merged with its deleted left neighbour (see [0]), moving the keyspace of the linked page. We can therefore only trust the current state of the page for this optimization, which means we must validate this state each time we pin a page. [0] https://www.postgresql.org/message-id/CAH2-Wzn_NAyK4pR0HRWO0StwHmxjP5qyu+X8vppt030XpqrO6w@mail.gmail.com --- contrib/amcheck/verify_nbtree.c | 25 +++++---- src/backend/access/nbtree/README | 24 +++++++++ src/backend/access/nbtree/nbtinsert.c | 28 ++++++---- src/backend/access/nbtree/nbtsearch.c | 74 +++++++++++++++++++++------ src/include/access/nbtree.h | 9 ++-- 5 files changed, 120 insertions(+), 40 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index fdfc320e84..09a43d4fff 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2651,6 +2651,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) BTStack stack; Buffer lbuf; bool exists; + AttrNumber cmpcol = 1; key = _bt_mkscankey(state->rel, itup); Assert(key->heapkeyspace && key->scantid != NULL); @@ -2681,13 +2682,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.buf = lbuf; /* Get matching tuple on leaf page */ - offnum = _bt_binsrch_insert(state->rel, &insertstate); + offnum = _bt_binsrch_insert(state->rel, &insertstate, 1); /* Compare first >= matching item on leaf page, if any 
*/ page = BufferGetPage(lbuf); /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && insertstate.postingoff <= 0 && - _bt_compare(state->rel, key, page, offnum) == 0) + _bt_compare(state->rel, key, page, offnum, &cmpcol) == 0) exists = true; _bt_relbuf(state->rel, lbuf); } @@ -2748,7 +2749,8 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { ItemId itemid; - int32 cmp; + int32 cmp; + AttrNumber cmpcol = 1; /* first attribute to compare */ Assert(key->pivotsearch); @@ -2759,7 +2761,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, if (!key->heapkeyspace) return invariant_leq_offset(state, key, upperbound); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); /* * _bt_compare() is capable of determining that a scankey with a @@ -2810,11 +2812,12 @@ static inline bool invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { - int32 cmp; + int32 cmp; + AttrNumber cmpcol = 1; /* first attribute to compare */ Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); return cmp <= 0; } @@ -2833,11 +2836,12 @@ static inline bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound) { - int32 cmp; + int32 cmp; + AttrNumber cmpcol = 1; /* first attribute to compare */ Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, lowerbound); + cmp = _bt_compare(state->rel, key, state->target, lowerbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) @@ -2871,14 +2875,15 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { ItemId itemid; - int32 cmp; + int32 cmp; + AttrNumber cmpcol = 1; /* first attribute to compare */ Assert(key->pivotsearch); /* Verify line pointer before checking tuple */ itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, upperbound); 
- cmp = _bt_compare(state->rel, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index bfe33b6b43..79a179afad 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -874,6 +874,30 @@ large groups of duplicates, maximizing space utilization. Note also that deduplication more efficient. Deduplication can be performed infrequently, without merging together existing posting list tuples too often. +Notes about dynamic prefix truncation +------------------------------------- + +Because NBTrees have a sorted keyspace, when we have determined that some +prefix columns of the tuples on both sides of the tuple that is being +compared are equal to the scankey, then the current tuple must also share +this prefix with the scankey. This allows us to skip comparing those columns, +potentially saving cycles. + +We can only use this constraint if we have proven this information while we +hold a pin on the page, so this is only useful on the page level: concurrent +page deletions and splits may have moved the keyspace of the page referenced +by an inner page to the right. If we re-used high- and low-column-prefixes, +we would not be able to detect a change of keyspace from e.g. (2,2) to (1,2), +and subsequently return invalid results. This race condition can only be +prevented by re-establishing the prefix-equal-columns for each page. + +The upside is that we already have a comparison result for the highest +value of a page: a page's high key is compared to the scankey while we have +a pin on the page in the _bt_moveright procedure. The _bt_binsrch procedure +uses this result as its initial rightmost prefix bound, and each step of the +binary search then improves the equal-prefix bound on the side of the search +range that it narrows. 
+ Notes about deduplication ------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 6ac205c98e..c5b32e7ce5 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -325,6 +325,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate) { Page page; BTPageOpaque opaque; + AttrNumber comparecol = 1; _bt_checkpage(rel, insertstate->buf); page = BufferGetPage(insertstate->buf); @@ -343,7 +344,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate) !P_IGNORE(opaque) && PageGetFreeSpace(page) > insertstate->itemsz && PageGetMaxOffsetNumber(page) >= P_HIKEY && - _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) + _bt_compare(rel, insertstate->itup_key, page, P_HIKEY, &comparecol) > 0) { /* * Caller can use the fastpath optimization because cached @@ -437,7 +438,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * in the fastpath below, but also in the _bt_findinsertloc() call later. */ Assert(!insertstate->bounds_valid); - offset = _bt_binsrch_insert(rel, insertstate); + offset = _bt_binsrch_insert(rel, insertstate, 1); /* * Scan over all equal tuples, looking for live conflicts. @@ -447,6 +448,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(itup_key->scantid == NULL); for (;;) { + AttrNumber cmpcol = 1; /* * Each iteration of the loop processes one heap TID, not one index * tuple. 
Current offset number for page isn't usually advanced on @@ -482,7 +484,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(insertstate->bounds_valid); Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); Assert(insertstate->low <= insertstate->stricthigh); - Assert(_bt_compare(rel, itup_key, page, offset) < 0); + Assert(_bt_compare(rel, itup_key, page, offset, &cmpcol) < 0); break; } @@ -507,7 +509,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, if (!inposting) { /* Plain tuple, or first TID in posting list tuple */ - if (_bt_compare(rel, itup_key, page, offset) != 0) + if (_bt_compare(rel, itup_key, page, offset, &cmpcol) != 0) break; /* we're past all the equal tuples */ /* Advanced curitup */ @@ -717,11 +719,12 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else { int highkeycmp; + cmpcol = 1; /* If scankey == hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; - highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); + highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol); Assert(highkeycmp <= 0); if (highkeycmp != 0) break; @@ -823,6 +826,7 @@ _bt_findinsertloc(Relation rel, Page page = BufferGetPage(insertstate->buf); BTPageOpaque opaque; OffsetNumber newitemoff; + AttrNumber cmpcol = 1; opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -867,6 +871,7 @@ _bt_findinsertloc(Relation rel, for (;;) { + cmpcol = 1; /* * Does the new tuple belong on this page? 
* @@ -884,7 +889,7 @@ _bt_findinsertloc(Relation rel, /* Test '<=', not '!=', since scantid is set now */ if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0) break; _bt_stepright(rel, insertstate, stack); @@ -937,6 +942,7 @@ _bt_findinsertloc(Relation rel, */ while (PageGetFreeSpace(page) < insertstate->itemsz) { + cmpcol = 1; /* * Before considering moving right, see if we can obtain enough * space by erasing LP_DEAD items @@ -967,7 +973,7 @@ _bt_findinsertloc(Relation rel, break; if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) != 0 || random() <= (MAX_RANDOM_VALUE / 100)) break; @@ -982,10 +988,10 @@ _bt_findinsertloc(Relation rel, * We should now be on the correct page. Find the offset within the page * for the new tuple. (Possibly reusing earlier search bounds.) */ - Assert(P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); + Assert(((cmpcol = 1), (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0))); - newitemoff = _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate, cmpcol); if (insertstate->postingoff == -1) { @@ -1004,7 +1010,7 @@ _bt_findinsertloc(Relation rel, */ Assert(!insertstate->bounds_valid); insertstate->postingoff = 0; - newitemoff = _bt_binsrch_insert(rel, insertstate); + newitemoff = _bt_binsrch_insert(rel, insertstate, cmpcol); Assert(insertstate->postingoff == 0); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index e002c11e8b..214e699274 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,7 +25,7 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); -static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +static OffsetNumber _bt_binsrch(Relation 
rel, BTScanInsert key, Buffer buf, AttrNumber highkeycmpcol); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, @@ -121,6 +121,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, IndexTuple itup; BlockNumber child; BTStack new_stack; + AttrNumber comparecol = 1; /* * Race -- the page we just grabbed may have split since we read its @@ -135,7 +136,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * opportunity to finish splits of internal pages too. */ *bufP = _bt_moveright(rel, key, *bufP, (access == BT_WRITE), stack_in, - page_access, snapshot); + page_access, snapshot, &comparecol); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -147,7 +148,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * Find the appropriate pivot tuple on this page. Its downlink points * to the child page that we're about to descend to. */ - offnum = _bt_binsrch(rel, key, *bufP); + offnum = _bt_binsrch(rel, key, *bufP, comparecol); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); @@ -186,6 +187,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, */ if (access == BT_WRITE && page_access == BT_READ) { + AttrNumber comparecol = 1; /* trade in our read lock for a write lock */ _bt_unlockbuf(rel, *bufP); _bt_lockbuf(rel, *bufP, BT_WRITE); @@ -196,7 +198,7 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE, - snapshot); + snapshot, &comparecol); } return stack_in; @@ -238,18 +240,22 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, * positioning for an insert or delete, so NULL is used for those cases. 
*/ Buffer -_bt_moveright(Relation rel, +_bt_moveright( + Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, - Snapshot snapshot) + Snapshot snapshot, + AttrNumber *comparecol) { Page page; BTPageOpaque opaque; int32 cmpval; + Assert(PointerIsValid(comparecol)); + /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split @@ -271,12 +277,16 @@ _bt_moveright(Relation rel, for (;;) { + AttrNumber cmpcol = 1; page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque)) + { + *comparecol = cmpcol; break; + } /* * Finish any incomplete splits we encounter along the way. @@ -302,14 +312,18 @@ _bt_moveright(Relation rel, continue; } - if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval) + if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY, &cmpcol) >= cmpval) { /* step right one page */ buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); + *comparecol = 1; continue; } else + { + *comparecol = cmpcol; break; + } } if (P_IGNORE(opaque)) @@ -342,7 +356,8 @@ _bt_moveright(Relation rel, static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, - Buffer buf) + Buffer buf, + AttrNumber highkeycmpcol) { Page page; BTPageOpaque opaque; @@ -350,6 +365,9 @@ _bt_binsrch(Relation rel, high; int32 result, cmpval; + AttrNumber curcmpcol = 1, + highcmpcol = highkeycmpcol, + lowcmpcol = 1; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -394,12 +412,20 @@ _bt_binsrch(Relation rel, /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, key, page, mid); + result = _bt_compare(rel, key, page, mid, &curcmpcol); + Assert(((curcmpcol = 1), result == _bt_compare(rel, key, page, mid, &curcmpcol))); if (result >= cmpval) + { low = mid + 1; + lowcmpcol = curcmpcol; + } else + { 
high = mid; + highcmpcol = curcmpcol; + } + curcmpcol = Min(highcmpcol, lowcmpcol); } /* @@ -444,7 +470,7 @@ _bt_binsrch(Relation rel, * list split). */ OffsetNumber -_bt_binsrch_insert(Relation rel, BTInsertState insertstate) +_bt_binsrch_insert(Relation rel, BTInsertState insertstate, AttrNumber highcmpcol) { BTScanInsert key = insertstate->itup_key; Page page; @@ -454,6 +480,9 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) stricthigh; int32 result, cmpval; + AttrNumber cmpcol = 1, + highcol = highcmpcol, + lowcol = 1; page = BufferGetPage(insertstate->buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -507,17 +536,23 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate) /* We have low <= mid < high, so mid points at a real slot */ - result = _bt_compare(rel, key, page, mid); + result = _bt_compare(rel, key, page, mid, &cmpcol); if (result >= cmpval) + { low = mid + 1; + lowcol = cmpcol; + } else { high = mid; + highcol = cmpcol; if (result != 0) stricthigh = high; } + cmpcol = Min(highcol, lowcol); + /* * If tuple at offset located by binary search is a posting list whose * TID range overlaps with caller's scantid, perform posting list @@ -644,7 +679,8 @@ int32 _bt_compare(Relation rel, BTScanInsert key, Page page, - OffsetNumber offnum) + OffsetNumber offnum, + AttrNumber *comparecol) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); @@ -659,6 +695,7 @@ _bt_compare(Relation rel, Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); Assert(key->heapkeyspace || key->scantid == NULL); + Assert(*comparecol >= 1 && *comparecol <= key->keysz + 1); /* * Force result ">" if target item is first data item on an internal page @@ -685,11 +722,11 @@ _bt_compare(Relation rel, ncmpkey = Min(ntupatts, key->keysz); Assert(key->heapkeyspace || ncmpkey == key->keysz); Assert(!BTreeTupleIsPosting(itup) || 
key->allequalimage); - scankey = key->scankeys; + scankey = key->scankeys + (*comparecol - 1); - index_attiterinit(itup, 1, itupdesc, &iter); + index_attiterinit(itup, *comparecol, itupdesc, &iter); - for (int i = 1; i <= ncmpkey; i++) + for (int i = *comparecol; i <= ncmpkey; i++) { Datum datum; @@ -732,7 +769,10 @@ _bt_compare(Relation rel, /* if the keys are unequal, return the difference */ if (result != 0) + { + *comparecol = i; return result; + } scankey++; } @@ -746,6 +786,8 @@ _bt_compare(Relation rel, * scankey won't, so explicitly excluding non-key attributes isn't * necessary. */ + *comparecol = ncmpkey + 1; + if (key->keysz > ntupatts) return 1; @@ -1383,7 +1425,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, &inskey, buf); + offnum = _bt_binsrch(rel, &inskey, buf, 1); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 42c66fac57..65e1861eee 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1224,9 +1224,12 @@ extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate); extern BTStack _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access, Snapshot snapshot); extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, - bool forupdate, BTStack stack, int access, Snapshot snapshot); -extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); -extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); + bool forupdate, BTStack stack, int access, Snapshot snapshot, + AttrNumber *comparecol); +extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum, + AttrNumber *comparecol); extern bool 
_bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, -- 2.20.1