From 4f8c9cb8af171fca3226ef9acb2883623576983d Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 15 Apr 2022 18:25:38 +0200 Subject: [PATCH v3 7/9] Implement dynamic prefix compression in nbtree Because tuples are ordered on the page, if some prefix of the scan attributes of the tuples on both sides of the tuple being compared is equal to the scankey, then the tuple being compared must also have those prefix attributes equal to the scankey. We cannot propagate this information to _bt_binsrch on lower pages, as the downstream page may concurrently have split and/or merged with its deleted left neighbour (see [0]), which moves the keyspace of the linked page. We can thus only trust the state of the page we are currently on for this optimization, which means we must validate this state each time we open a page. Although this limits the overall gain, it still yields a nice performance improvement in most cases where the leading columns have many duplicate values and a compare function that is not cheap. 
--- contrib/amcheck/verify_nbtree.c | 17 +++-- src/backend/access/nbtree/README | 25 ++++++++ src/backend/access/nbtree/nbtinsert.c | 14 ++-- src/backend/access/nbtree/nbtinsert_spec.h | 22 +++++-- src/backend/access/nbtree/nbtsearch.c | 2 +- src/backend/access/nbtree/nbtsearch_spec.h | 75 +++++++++++++++++----- src/include/access/nbtree_specialized.h | 8 ++- 7 files changed, 127 insertions(+), 36 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 70278c4f93..5753611546 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2673,6 +2673,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) BTInsertStateData insertstate; OffsetNumber offnum; Page page; + AttrNumber cmpcol = 1; insertstate.itup = itup; insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); @@ -2682,13 +2683,13 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) insertstate.buf = lbuf; /* Get matching tuple on leaf page */ - offnum = _bt_binsrch_insert(state->rel, &insertstate); + offnum = _bt_binsrch_insert(state->rel, &insertstate, 1); /* Compare first >= matching item on leaf page, if any */ page = BufferGetPage(lbuf); /* Should match on first heap TID when tuple has a posting list */ if (offnum <= PageGetMaxOffsetNumber(page) && insertstate.postingoff <= 0 && - _bt_compare(state->rel, key, page, offnum) == 0) + _bt_compare(state->rel, key, page, offnum, &cmpcol) == 0) exists = true; _bt_relbuf(state->rel, lbuf); } @@ -2750,6 +2751,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, { ItemId itemid; int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); @@ -2760,7 +2762,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, if (!key->heapkeyspace) return invariant_leq_offset(state, key, upperbound); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); /* * _bt_compare() is capable of 
determining that a scankey with a @@ -2812,10 +2814,11 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound) { int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, upperbound); + cmp = _bt_compare(state->rel, key, state->target, upperbound, &cmpcol); return cmp <= 0; } @@ -2835,10 +2838,11 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound) { int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); - cmp = _bt_compare(state->rel, key, state->target, lowerbound); + cmp = _bt_compare(state->rel, key, state->target, lowerbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) @@ -2873,13 +2877,14 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, { ItemId itemid; int32 cmp; + AttrNumber cmpcol = 1; Assert(key->pivotsearch); /* Verify line pointer before checking tuple */ itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, upperbound); - cmp = _bt_compare(state->rel, key, nontarget, upperbound); + cmp = _bt_compare(state->rel, key, nontarget, upperbound, &cmpcol); /* pg_upgrade'd indexes may legally have equal sibling tuples */ if (!key->heapkeyspace) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 3c08888c23..13ac9ee2be 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -901,6 +901,31 @@ large groups of duplicates, maximizing space utilization. Note also that deduplication more efficient. Deduplication can be performed infrequently, without merging together existing posting list tuples too often. 
+ +Notes about dynamic prefix truncation +------------------------------------- + +Because NBTrees have a sorted keyspace, when we have determined that some +prefix columns of the tuples on both sides of the tuple that is being +compared are equal to the scankey, then the current tuple must also share +this prefix with the scankey. This allows us to skip comparing those columns, +potentially saving cycles. + +We can only use this constraint if we have proven this information while we +hold a pin on the page, so this is only useful at the page level: concurrent +page deletions and splits may have moved the keyspace of the page referenced +by an inner page to the right. If we re-used high- and low-column-prefixes, +we would not be able to detect a change of keyspace from e.g. (2,2) to (1,2), +and would subsequently return invalid results. This race condition can only be +prevented by re-establishing the prefix-equal-columns for each page. + +The upside is that we already have a comparison result for the highest +value of a page: a page's high key is compared to the scankey while we have +a pin on the page in the _bt_moveright procedure. The _bt_binsrch procedure +will use this result as a rightmost prefix compare, and for each step in the +binary search (that does not compare less than the insert key) improve the +equal-prefix bounds. + Notes about deduplication ------------------------- diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ec6c73d1cc..20e5f33f98 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -132,7 +132,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, * in the fastpath below, but also in the _bt_findinsertloc() call later. 
*/ Assert(!insertstate->bounds_valid); - offset = nbts_call(_bt_binsrch_insert, rel, insertstate); + offset = nbts_call(_bt_binsrch_insert, rel, insertstate, 1); /* * Scan over all equal tuples, looking for live conflicts. @@ -142,6 +142,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(itup_key->scantid == NULL); for (;;) { + AttrNumber cmpcol = 1; + /* * Each iteration of the loop processes one heap TID, not one index * tuple. Current offset number for page isn't usually advanced on @@ -177,7 +179,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, Assert(insertstate->bounds_valid); Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); Assert(insertstate->low <= insertstate->stricthigh); - Assert(nbts_call(_bt_compare, rel, itup_key, page, offset) < 0); + Assert(nbts_call(_bt_compare, rel, itup_key, page, offset, + &cmpcol) < 0); break; } @@ -202,7 +205,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, if (!inposting) { /* Plain tuple, or first TID in posting list tuple */ - if (nbts_call(_bt_compare, rel, itup_key, page, offset) != 0) + if (nbts_call(_bt_compare, rel, itup_key, page, offset, + &cmpcol) != 0) break; /* we're past all the equal tuples */ /* Advanced curitup */ @@ -412,11 +416,13 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, else { int highkeycmp; + cmpcol = 1; /* If scankey == hikey we gotta check the next page too */ if (P_RIGHTMOST(opaque)) break; - highkeycmp = nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY); + highkeycmp = nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY, + &cmpcol); Assert(highkeycmp <= 0); if (highkeycmp != 0) break; diff --git a/src/backend/access/nbtree/nbtinsert_spec.h b/src/backend/access/nbtree/nbtinsert_spec.h index 97c866aea3..ccba0fa5ed 100644 --- a/src/backend/access/nbtree/nbtinsert_spec.h +++ b/src/backend/access/nbtree/nbtinsert_spec.h @@ -73,6 +73,7 @@ 
NBTS_FUNCTION(_bt_search_insert)(Relation rel, BTInsertState insertstate) { Page page; BTPageOpaque opaque; + AttrNumber comparecol = 1; _bt_checkpage(rel, insertstate->buf); page = BufferGetPage(insertstate->buf); @@ -91,7 +92,8 @@ NBTS_FUNCTION(_bt_search_insert)(Relation rel, BTInsertState insertstate) !P_IGNORE(opaque) && PageGetFreeSpace(page) > insertstate->itemsz && PageGetMaxOffsetNumber(page) >= P_HIKEY && - nbts_call(_bt_compare, rel, insertstate->itup_key, page, P_HIKEY) > 0) + nbts_call(_bt_compare, rel, insertstate->itup_key, page, + P_HIKEY, &comparecol) > 0) { /* * Caller can use the fastpath optimization because cached @@ -221,6 +223,7 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, for (;;) { + AttrNumber cmpcol = 1; /* * Does the new tuple belong on this page? * @@ -238,7 +241,7 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, /* Test '<=', not '!=', since scantid is set now */ if (P_RIGHTMOST(opaque) || - nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY) <= 0) + nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY, &cmpcol) <= 0) break; _bt_stepright(rel, insertstate, stack); @@ -291,6 +294,7 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, */ while (PageGetFreeSpace(page) < insertstate->itemsz) { + AttrNumber cmpcol = 1; /* * Before considering moving right, see if we can obtain enough * space by erasing LP_DEAD items @@ -321,7 +325,8 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, break; if (P_RIGHTMOST(opaque) || - nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY) != 0 || + nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY, + &cmpcol) != 0 || pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) break; @@ -336,10 +341,13 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, * We should now be on the correct page. Find the offset within the page * for the new tuple. (Possibly reusing earlier search bounds.) 
*/ - Assert(P_RIGHTMOST(opaque) || - nbts_call(_bt_compare, rel, itup_key, page, P_HIKEY) <= 0); + { + AttrNumber cmpcol PG_USED_FOR_ASSERTS_ONLY = 1; + Assert(P_RIGHTMOST(opaque) || nbts_call(_bt_compare, rel, itup_key, + page, P_HIKEY, &cmpcol) <= 0); + } - newitemoff = nbts_call(_bt_binsrch_insert, rel, insertstate); + newitemoff = nbts_call(_bt_binsrch_insert, rel, insertstate, 1); if (insertstate->postingoff == -1) { @@ -358,7 +366,7 @@ NBTS_FUNCTION(_bt_findinsertloc)(Relation rel, */ Assert(!insertstate->bounds_valid); insertstate->postingoff = 0; - newitemoff = nbts_call(_bt_binsrch_insert, rel, insertstate); + newitemoff = nbts_call(_bt_binsrch_insert, rel, insertstate, 1); Assert(insertstate->postingoff == 0); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d5152bfcb7..036ce88679 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -696,7 +696,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ - offnum = nbts_call(_bt_binsrch, rel, &inskey, buf); + offnum = nbts_call(_bt_binsrch, rel, &inskey, buf, 1); /* * If nextkey = false, we are positioned at the first item >= scan key, or diff --git a/src/backend/access/nbtree/nbtsearch_spec.h b/src/backend/access/nbtree/nbtsearch_spec.h index a5c5f2b94f..829c216819 100644 --- a/src/backend/access/nbtree/nbtsearch_spec.h +++ b/src/backend/access/nbtree/nbtsearch_spec.h @@ -10,8 +10,10 @@ */ #ifndef NBTS_SPECIALIZING_DEFAULT -static OffsetNumber NBTS_FUNCTION(_bt_binsrch)(Relation rel, BTScanInsert key, - Buffer buf); +static OffsetNumber NBTS_FUNCTION(_bt_binsrch)(Relation rel, + BTScanInsert key, + Buffer buf, + AttrNumber highkeycmpcol); static bool NBTS_FUNCTION(_bt_readpage)(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum); @@ -38,7 +40,8 @@ static bool NBTS_FUNCTION(_bt_readpage)(IndexScanDesc scan, ScanDirection dir, 
static OffsetNumber NBTS_FUNCTION(_bt_binsrch)(Relation rel, BTScanInsert key, - Buffer buf) + Buffer buf, + AttrNumber highkeycmpcol) { Page page; BTPageOpaque opaque; @@ -46,6 +49,8 @@ NBTS_FUNCTION(_bt_binsrch)(Relation rel, high; int32 result, cmpval; + AttrNumber highcmpcol = highkeycmpcol, + lowcmpcol = 1; page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -87,15 +92,22 @@ NBTS_FUNCTION(_bt_binsrch)(Relation rel, while (high > low) { OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* We have low <= mid < high, so mid points at a real slot */ - result = nbts_call(_bt_compare, rel, key, page, mid); + result = nbts_call(_bt_compare, rel, key, page, mid, &cmpcol); if (result >= cmpval) + { low = mid + 1; + lowcmpcol = cmpcol; + } else + { high = mid; + highcmpcol = cmpcol; + } } /* @@ -441,6 +453,7 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, Buffer *bufP, IndexTuple itup; BlockNumber child; BTStack new_stack; + AttrNumber highkeycmpcol = 1; /* * Race -- the page we just grabbed may have split since we read its @@ -456,7 +469,7 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, Buffer *bufP, */ *bufP = nbts_call(_bt_moveright, rel, key, *bufP, (access == BT_WRITE), stack_in, - page_access, snapshot); + page_access, snapshot, &highkeycmpcol); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -468,7 +481,7 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, Buffer *bufP, * Find the appropriate pivot tuple on this page. Its downlink points * to the child page that we're about to descend to. 
*/ - offnum = nbts_call(_bt_binsrch, rel, key, *bufP); + offnum = nbts_call(_bt_binsrch, rel, key, *bufP, highkeycmpcol); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); @@ -507,6 +520,7 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, Buffer *bufP, */ if (access == BT_WRITE && page_access == BT_READ) { + AttrNumber highkeycmpcol = 1; /* trade in our read lock for a write lock */ _bt_unlockbuf(rel, *bufP); _bt_lockbuf(rel, *bufP, BT_WRITE); @@ -517,7 +531,7 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, Buffer *bufP, * move right to its new sibling. Do that. */ *bufP = nbts_call(_bt_moveright, rel, key, *bufP, true, stack_in, - BT_WRITE, snapshot); + BT_WRITE, snapshot, &highkeycmpcol); } return stack_in; @@ -565,12 +579,15 @@ NBTS_FUNCTION(_bt_moveright)(Relation rel, bool forupdate, BTStack stack, int access, - Snapshot snapshot) + Snapshot snapshot, + AttrNumber *comparecol) { Page page; BTPageOpaque opaque; int32 cmpval; + Assert(PointerIsValid(comparecol)); + /* * When nextkey = false (normal case): if the scan key that brought us to * this page is > the high key stored on the page, then the page has split @@ -592,12 +609,17 @@ NBTS_FUNCTION(_bt_moveright)(Relation rel, for (;;) { + AttrNumber cmpcol = 1; + page = BufferGetPage(buf); TestForOldSnapshot(snapshot, rel, page); opaque = BTPageGetOpaque(page); if (P_RIGHTMOST(opaque)) + { + *comparecol = cmpcol; break; + } /* * Finish any incomplete splits we encounter along the way. 
@@ -623,14 +645,19 @@ NBTS_FUNCTION(_bt_moveright)(Relation rel, continue; } - if (P_IGNORE(opaque) || nbts_call(_bt_compare, rel, key, page, P_HIKEY) >= cmpval) + if (P_IGNORE(opaque) || nbts_call(_bt_compare, rel, key, page, P_HIKEY, + &cmpcol) >= cmpval) { /* step right one page */ + *comparecol = 1; buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); continue; } else + { + *comparecol = cmpcol; break; + } } if (P_IGNORE(opaque)) @@ -663,7 +690,8 @@ NBTS_FUNCTION(_bt_moveright)(Relation rel, * list split). */ OffsetNumber -NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate) +NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol) { BTScanInsert key = insertstate->itup_key; Page page; @@ -673,6 +701,7 @@ NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate) stricthigh; int32 result, cmpval; + AttrNumber lowcmpcol = 1; page = BufferGetPage(insertstate->buf); opaque = BTPageGetOpaque(page); @@ -723,16 +752,21 @@ NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate) while (high > low) { OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* We have low <= mid < high, so mid points at a real slot */ - result = nbts_call(_bt_compare, rel, key, page, mid); + result = nbts_call(_bt_compare, rel, key, page, mid, &cmpcol); if (result >= cmpval) + { low = mid + 1; + lowcmpcol = cmpcol; + } else { high = mid; + highcmpcol = cmpcol; if (result != 0) stricthigh = high; } @@ -813,7 +847,8 @@ int32 NBTS_FUNCTION(_bt_compare)(Relation rel, BTScanInsert key, Page page, - OffsetNumber offnum) + OffsetNumber offnum, + AttrNumber *comparecol) { TupleDesc itupdesc = RelationGetDescr(rel); BTPageOpaque opaque = BTPageGetOpaque(page); @@ -854,10 +889,11 @@ NBTS_FUNCTION(_bt_compare)(Relation rel, ncmpkey = Min(ntupatts, key->keysz); Assert(key->heapkeyspace || ncmpkey == key->keysz); Assert(!BTreeTupleIsPosting(itup) || 
key->allequalimage); - scankey = key->scankeys; - nbts_attiterinit(itup, 1, itupdesc); - nbts_foreachattr(1, ncmpkey) + nbts_attiterinit(itup, *comparecol, itupdesc); + scankey = key->scankeys + ((*comparecol) - 1); + + nbts_foreachattr(*comparecol, ncmpkey) { Datum datum; @@ -902,11 +938,20 @@ NBTS_FUNCTION(_bt_compare)(Relation rel, /* if the keys are unequal, return the difference */ if (result != 0) + { + *comparecol = nbts_attiter_attnum; return result; + } scankey++; } + /* + * All tuple attributes are equal to the scan key, only later attributes + * could potentially not equal the scan key. + */ + *comparecol = ntupatts + 1; + /* * All non-truncated attributes (other than heap TID) were found to be * equal. Treat truncated attributes as minus infinity when scankey has a diff --git a/src/include/access/nbtree_specialized.h b/src/include/access/nbtree_specialized.h index c45fa84aed..7402a4c46e 100644 --- a/src/include/access/nbtree_specialized.h +++ b/src/include/access/nbtree_specialized.h @@ -43,12 +43,14 @@ NBTS_FUNCTION(_bt_search)(Relation rel, BTScanInsert key, extern Buffer NBTS_FUNCTION(_bt_moveright)(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, - Snapshot snapshot); + Snapshot snapshot, AttrNumber *comparecol); extern OffsetNumber -NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate); +NBTS_FUNCTION(_bt_binsrch_insert)(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol); extern int32 NBTS_FUNCTION(_bt_compare)(Relation rel, BTScanInsert key, - Page page, OffsetNumber offnum); + Page page, OffsetNumber offnum, + AttrNumber *comparecol); /* * prototypes for functions in nbtutils_spec.h -- 2.30.2