From ab3d7230031bfc6461b2334f564c434ae0cd4b12 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 19 Jun 2023 09:58:43 -0700 Subject: [PATCH v1] Add nbtree "goback" boundary case optimization. --- src/include/access/nbtree.h | 9 ++-- src/backend/access/nbtree/nbtpage.c | 19 +++++-- src/backend/access/nbtree/nbtsearch.c | 75 ++++++++++++++++----------- src/backend/access/nbtree/nbtutils.c | 12 +++-- contrib/amcheck/verify_nbtree.c | 16 +++--- 5 files changed, 79 insertions(+), 52 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8891fa797..6cffbf6d5 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -767,9 +767,10 @@ typedef BTStackData *BTStack; * locate the first item >= scankey. When nextkey is true, they will locate * the first item > scan key. * - * pivotsearch is set to true by callers that want to re-find a leaf page - * using a scankey built from a leaf page's high key. Most callers set this - * to false. + * goback is set to true by callers that intend to back up from the first + * match according to the scankey once on the leaf level. Forward scan + * callers always set this to false. Some backward scan callers prefer to set + * this to true to avoid scanning an extra leaf page in boundary cases. * * scantid is the heap TID that is used as a final tiebreaker attribute. It * is set to NULL when index scan doesn't need to find a position for a @@ -792,7 +793,7 @@ typedef struct BTScanInsertData bool allequalimage; bool anynullkeys; bool nextkey; - bool pivotsearch; + bool goback; ItemPointer scantid; /* tiebreaker for scankeys */ int keysz; /* Size of scankeys array */ ScanKeyData scankeys[INDEX_MAX_KEYS]; /* Must appear last */ diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c2050656e..61e4c7dac 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1958,12 +1958,21 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) return; } - /* we need an insertion scan key for the search, so build one */ + /* + * We need an insertion scan key for the search, so build one. + * We specify goback=true because we're searching for the + * location of a matching high key (not the location of the + * first matching non-pivot tuple, which will be in the right + * sibling of the sleafbuf page returned by _bt_search). + * + * Note: Unlike regular "goback" searchers, we're not prepared + * to step back from the page returned by _bt_search. That + * makes specifying goback=true crucial here (for us it's not + * merely about handling certain boundary cases optimally.) + */ itup_key = _bt_mkscankey(rel, targetkey); - /* find the leftmost leaf page with matching pivot/high key */ - itup_key->pivotsearch = true; - stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, - NULL); + itup_key->goback = true; + stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ, NULL); /* won't need a second lock or pin on leafbuf */ _bt_relbuf(rel, sleafbuf); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 7e05e5867..d7798510d 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -792,11 +792,24 @@ _bt_compare(Relation rel, * doesn't have to descend left because it isn't interested in a match * that has a heap TID value of -inf. * - * However, some searches (pivotsearch searches) actually require that - * we descend left when this happens. -inf is treated as a possible - * match for omitted scankey attribute(s). This is needed by page - * deletion, which must re-find leaf pages that are targets for - * deletion using their high keys. + * We take a symmetrical approach in our handling of such a boundary + * case during goback searches (though with the same goal in mind as + * the regular/!goback case). Here we treat -inf as a possible match + * for omitted scankey attribute(s) instead. That has the effect of + * making the search avoid uselessly locking/pinning the page to the + * right. In other words, goback search should descend left directly + * because it is only interested in a match that is strictly < 'foo'. + * All goback searches are from backward scan callers that don't + * actually need to visit the page to the right at all. + * + * Note: Some goback searches won't get this behavior. Consider a + * backward scan whose insertion scan key locates tuples <= 'foo'. + * This search needs to be able to find matches on both sides of the + * ('foo', inf) leaf level high key, so visiting at least two leaf + * pages is unavoidable. This case will use an initial insertion scan + * key that is goback=true and nextkey=true. (When nextkey=true then + * goback won't influence how _bt_search descends the tree, no matter + * what we do here.) * * Note: the heap TID part of the test ensures that scankey is being * compared to a pivot tuple with one or more truncated key @@ -808,9 +821,13 @@ _bt_compare(Relation rel, * non-key attributes). !heapkeyspace searches must always be * prepared to deal with matches on both sides of the pivot once the * leaf level is reached. + * + * Note: it would be safe (though far from optimal) for every search + * to use goback=true; heapkeyspace searches _effectively_ do so for + * every _bt_search call. */ - if (key->heapkeyspace && !key->pivotsearch && - key->keysz == ntupatts && heapTid == NULL) + if (heapTid == NULL && key->keysz == ntupatts && !key->goback && + key->heapkeyspace) return 1; /* All provided scankey arguments found to be equal */ @@ -875,8 +892,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) BTStack stack; OffsetNumber offnum; StrategyNumber strat; - bool nextkey; - bool goback; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; @@ -1283,6 +1298,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * goback = false, we will start the scan on the located item. *---------- */ + _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); + inskey.anynullkeys = false; /* unused */ + inskey.scantid = NULL; + inskey.keysz = keysCount; switch (strat_total) { case BTLessStrategyNumber: @@ -1293,8 +1312,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for a backward scan, so that is always the correct starting * position.) */ - nextkey = false; - goback = true; + inskey.nextkey = false; + inskey.goback = true; break; case BTLessEqualStrategyNumber: @@ -1305,8 +1324,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for a backward scan, so that is always the correct starting * position.) */ - nextkey = true; - goback = true; + inskey.nextkey = true; + inskey.goback = true; break; case BTEqualStrategyNumber: @@ -1321,8 +1340,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * This is the same as the <= strategy. We will check at the * end whether the found item is actually =. */ - nextkey = true; - goback = true; + inskey.nextkey = true; + inskey.goback = true; } else { @@ -1330,8 +1349,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * This is the same as the >= strategy. We will check at the * end whether the found item is actually =. */ - nextkey = false; - goback = false; + inskey.nextkey = false; + inskey.goback = false; } break; @@ -1341,8 +1360,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Find first item >= scankey. (This is only used for forward * scans.) */ - nextkey = false; - goback = false; + inskey.nextkey = false; + inskey.goback = false; break; case BTGreaterStrategyNumber: @@ -1351,8 +1370,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * Find first item > scankey. (This is only used for forward * scans.) */ - nextkey = true; - goback = false; + inskey.nextkey = true; + inskey.goback = false; break; default: @@ -1361,14 +1380,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) return false; } - /* Initialize remaining insertion scan key fields */ - _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); - inskey.anynullkeys = false; /* unused */ - inskey.nextkey = nextkey; - inskey.pivotsearch = false; - inskey.scantid = NULL; - inskey.keysz = keysCount; - /* * Use the manufactured insertion scan key to descend the tree and * position ourselves on the target leaf page. @@ -1420,9 +1431,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * the last item on this page. Adjust the starting offset if needed. (If * this results in an offset before the first item or after the last one, * _bt_readpage will report no items found, and then we'll step to the - * next page as needed.) + * next page as needed. _bt_search avoids this whenever possible.) */ - if (goback) + if (inskey.goback) offnum = OffsetNumberPrev(offnum); /* remember which buffer we have pinned, if any */ @@ -1614,6 +1625,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) { @@ -1722,6 +1734,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) tuple_alive = true; itup = (IndexTuple) PageGetItem(page, iid); + Assert(!BTreeTupleIsPivot(itup)); passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, &continuescan); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 7da499c4d..87714aba0 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -65,8 +65,8 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft, * When itup is a non-pivot tuple, the returned insertion scan key is * suitable for finding a place for it to go on the leaf level. Pivot * tuples can be used to re-find leaf page with matching high key, but - * then caller needs to set scan key's pivotsearch field to true. This - * allows caller to search for a leaf page with a matching high key, + * then caller needs to set insertion scan key's goback field to true. + * This allows caller to search for a leaf page with a matching high key, * which is usually to the left of the first leaf page a non-pivot match * might appear on. * @@ -121,7 +121,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup) } key->anynullkeys = false; /* initial assumption */ key->nextkey = false; - key->pivotsearch = false; + key->goback = false; key->keysz = Min(indnkeyatts, tupnatts); key->scantid = key->heapkeyspace && itup ? BTreeTupleGetHeapTID(itup) : NULL; @@ -725,7 +725,11 @@ _bt_restore_array_keys(IndexScanDesc scan) * failure of either key is indeed enough to stop the scan. (In general, when * inequality keys are present, the initial-positioning code only promises to * position before the first possible match, not exactly at the first match, - * for a forward scan; or after the last match for a backward scan.) + * for a forward scan; or after the last match for a backward scan. Note, + * however, that there are various optimizations in the initial-positioning + * code that maximize the likelihood that _bt_search will return the leaf page + * whose key space makes it precisely the first/last leaf page where a match + * might be found.) * * As a byproduct of this work, we can detect contradictory quals such * as "x = 1 AND x > 2". If we see that, we return so->qual_ok = false, diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 94a975932..e108e806e 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2779,7 +2779,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, ItemId itemid; int32 cmp; - Assert(key->pivotsearch); + Assert(!key->nextkey && key->goback); /* Verify line pointer before checking tuple */ itemid = PageGetItemIdCareful(state, state->targetblock, state->target, @@ -2841,7 +2841,7 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, { int32 cmp; - Assert(key->pivotsearch); + Assert(!key->nextkey && key->goback); cmp = _bt_compare(state->rel, key, state->target, upperbound); @@ -2864,7 +2864,7 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key, { int32 cmp; - Assert(key->pivotsearch); + Assert(!key->nextkey && key->goback); cmp = _bt_compare(state->rel, key, state->target, lowerbound); @@ -2902,7 +2902,7 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, ItemId itemid; int32 cmp; - Assert(key->pivotsearch); + Assert(!key->nextkey && key->goback); /* Verify line pointer before checking tuple */ itemid = PageGetItemIdCareful(state, nontargetblock, nontarget, @@ -3128,9 +3128,9 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * For example, invariant_g_offset() might miss a cross-page invariant failure * on an internal level if the scankey built from the first item on the * target's right sibling page happened to be equal to (not greater than) the - * last item on target page. The !pivotsearch tiebreaker in _bt_compare() - * might otherwise cause amcheck to assume (rather than actually verify) that - * the scankey is greater. + * last item on target page. The !goback tiebreaker in _bt_compare() might + * otherwise cause amcheck to assume (rather than actually verify) that the + * scankey is greater. */ static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) @@ -3138,7 +3138,7 @@ bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) BTScanInsert skey; skey = _bt_mkscankey(rel, itup); - skey->pivotsearch = true; + skey->goback = true; return skey; } -- 2.40.1