From eab94f627c1cbddcc8dec2f1553f28bf3007dab5 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Thu, 20 May 2021 21:17:51 +0200 Subject: [PATCH v41 4/6] Extend amskip implementation for Btree Add support for backward scan to Btree amskip implementation. This will make index skip scan work without Material node in case of scrolling cursor. Author: Jesper Pedersen, Dmitry Dolgov Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan --- src/backend/access/index/indexam.c | 6 +- src/backend/access/nbtree/nbtree.c | 5 +- src/backend/access/nbtree/nbtsearch.c | 302 ++++++++++++++++++++++- src/backend/executor/execAmi.c | 32 +-- src/backend/executor/nodeIndexonlyscan.c | 57 ++++- src/include/access/amapi.h | 1 + src/include/access/genam.h | 3 +- src/include/access/nbtree.h | 6 +- 8 files changed, 371 insertions(+), 41 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index bcf7c73467..9fa5db27ea 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -748,11 +748,13 @@ index_can_return(Relation indexRelation, int attno) * ---------------- */ bool -index_skip(IndexScanDesc scan, ScanDirection direction, int prefix) +index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool scanstart, int prefix) { SCAN_CHECKS; - return scan->indexRelation->rd_indam->amskip(scan, direction, prefix); + return scan->indexRelation->rd_indam->amskip(scan, direction, + indexdir, prefix); } /* ---------------- diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 7b2aa594fb..6a627de65a 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -449,9 +449,10 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * btskip() -- skip to the beginning of the next key prefix */ bool -btskip(IndexScanDesc scan, ScanDirection direction, int prefix) +btskip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, int prefix) { - return _bt_skip(scan, direction, prefix); + return _bt_skip(scan, direction, indexdir, prefix); } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 0ad761916a..00742c7e21 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1509,12 +1509,31 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * The current position is set so that a subsequent call to _bt_next will * fetch the first tuple that differs in the leading 'prefix' keys. * - * The current page is searched for the next unique value. If none is found - * we will do a scan from the root in order to find the next page with - * a unique value. + * There are four different kinds of skipping (depending on dir and + * indexdir, that are important to distinguish, especially in the presense + * of an index condition: + * + * * Advancing forward and reading forward + * simple scan + * + * * Advancing forward and reading backward + * scan inside a cursor fetching backward, when skipping is necessary + * right from the start + * + * * Advancing backward and reading forward + * scan with order by desc inside a cursor fetching forward, when + * skipping is necessary right from the start + * + * * Advancing backward and reading backward + * simple scan with order by desc + * + * The current page is searched for the next unique value. If none is found + * we will do a scan from the root in order to find the next page with + * a unique value. */ bool -_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix) +_bt_skip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, int prefix) { BTScanOpaque so = (BTScanOpaque) scan->opaque; BTStack stack; @@ -1625,11 +1644,282 @@ _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix) * to go one step back, since we need a last element from the previous * series. */ - if (ScanDirectionIsBackward(dir) || (ScanDirectionIsForward(dir) & scanstart)) + if ((ScanDirectionIsBackward(dir) && ScanDirectionIsBackward(indexdir)) || + (ScanDirectionIsForward(dir) && ScanDirectionIsBackward(indexdir) & scanstart)) offnum = OffsetNumberPrev(offnum); + /* + * Andvance backward but read forward. At this moment we are at the next + * distinct key at the beginning of the series. In case if scan just + * started, we can read forward without doing anything else. Otherwise + * find previous distinct key and the beginning of it's series and read + * forward from there. To do so, go back one step, perform binary search + * to find the first item in the series and let _bt_readpage do everything + * else. + */ + else if (ScanDirectionIsBackward(dir) && ScanDirectionIsForward(indexdir) && !scanstart) + { + /* Reading forward means we expect to see more data on the right */ + so->currPos.moreRight = true; + + /* One step back to find a previous value */ + if (!_bt_readpage(scan, dir, offnum) || + --so->currPos.itemIndex < so->currPos.firstItem) + { + _bt_unlockbuf(indexRel, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + else + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + _bt_update_skip_scankeys(scan, indexRel); + + /* + * And now find the last item from the sequence for the + * current, value with the intention do OffsetNumberNext. As a + * result we end up on a first element from the sequence. + */ + if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf)) + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + else + { + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + + stack = _bt_search(scan->indexRelation, so->skipScanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + offnum = _bt_binsrch(scan->indexRelation, so->skipScanKey, buf); + } + } + + /* + * Advance forward but read backward. At this moment we are at the next + * distinct key at the beginning of the series. In case if scan just + * started, we can go one step back and read forward without doing + * anything else. Otherwise find the next distinct key and the beginning + * of it's series, go one step back and read backward from there. + * + * An interesting situation can happen if one of distinct keys do not pass + * a corresponding index condition at all. In this case reading backward + * can lead to a previous distinct key being found, creating a loop. To + * avoid that check the value to be returned, and jump one more time if + * it's the same as at the beginning. Note that we do not check visibility + * here, and dead tuples could also lead to the same situation. This has to + * be checked on the caller side. + */ + else if (ScanDirectionIsForward(dir) && ScanDirectionIsBackward(indexdir) && !scanstart) + { + IndexTuple startItup = CopyIndexTuple(scan->xs_itup); + bool nextFound = false; + + /* Reading backwards means we expect to see more data on the left */ + so->currPos.moreLeft = true; + + for (;;) + { + IndexTuple itup; + OffsetNumber jumpOffset; + + if (nextFound) + break; + + /* + * Find a next index tuple to update scan key. It could be at + * the end, so check for max offset + */ + if (!_bt_readpage(scan, ForwardScanDirection, offnum)) + { + /* + * There's no actually-matching data on this page. Try to + * advance to the next page. Return false if there's no + * matching data at all. + */ + _bt_unlockbuf(indexRel, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + else + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + + currItem = &so->currPos.items[so->currPos.firstItem]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + scan->xs_itup = itup; + + _bt_update_skip_scankeys(scan, indexRel); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + + stack = _bt_search(scan->indexRelation, so->skipScanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + + /* + * We need to remember the original offset after the jump, + * since in case of looping this would be the next starting + * point + */ + jumpOffset = offnum = _bt_binsrch(scan->indexRelation, + so->skipScanKey, buf); + offnum = OffsetNumberPrev(offnum); + + if (!_bt_readpage(scan, indexdir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to + * advance to the next page. Return false if there's no + * matching data at all. + */ + _bt_unlockbuf(indexRel, so->currPos.buf); + if (!_bt_steppage(scan, indexdir)) + { + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + _bt_lockbuf(indexRel, so->currPos.buf, BT_READ); + } + + currItem = &so->currPos.items[so->currPos.lastItem]; + itup = CopyIndexTuple((IndexTuple) + (so->currTuples + currItem->tupleOffset)); + + /* + * To check if we returned the same tuple, try to find a + * startItup on the current page. For that we need to update + * scankey to match the whole tuple and set nextkey to return + * an exact tuple, not the next one. If the tuple we found in + * this way is equal to what we wanted to return, it means we + * are in the loop, return offnum to the original position and + * jump further + * + * Note that to compare tids we need to keep the leaf pinned, + * otherwise there is a danger of vacuum cleaning up relevant + * tuples. + */ + scan->xs_itup = startItup; + _bt_update_skip_scankeys(scan, indexRel); + + so->skipScanKey->keysz = IndexRelationGetNumberOfKeyAttributes(indexRel); + so->skipScanKey->nextkey = false; + + if (_bt_scankey_within_page(scan, so->skipScanKey, so->currPos.buf)) + { + OffsetNumber maxoff, startOffset; + IndexTuple verifiedItup; + Page page = BufferGetPage(so->currPos.buf); + startOffset = _bt_binsrch(scan->indexRelation, + so->skipScanKey, + so->currPos.buf); + + maxoff = PageGetMaxOffsetNumber(page); + + /* Now read the data */ + if (_bt_readpage(scan, ForwardScanDirection, startOffset)) + { + ItemPointer resultTids, verifyTids; + int nresult = 1, + nverify = 1; + + currItem = &so->currPos.items[so->currPos.itemIndex]; + verifiedItup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + /* + * We need to keep in mind that tuples we deal with + * could be also posting tuples and represent a list of + * tids. + */ + if (BTreeTupleIsPosting(verifiedItup)) + { + nverify = BTreeTupleGetNPosting(verifiedItup); + verifyTids = BTreeTupleGetPosting(verifiedItup); + for (int i = 1; i < nverify; i++) + verifyTids[i] = *BTreeTupleGetPostingN(verifiedItup, i); + } + else + verifyTids = &verifiedItup->t_tid; + + if (BTreeTupleIsPosting(itup)) + { + nresult = BTreeTupleGetNPosting(itup); + resultTids = BTreeTupleGetPosting(itup); + for (int i = 1; i < nresult; i++) + resultTids[i] = *BTreeTupleGetPostingN(itup, i); + } + else + resultTids = &itup->t_tid; + + /* One not equal means they're not equal. */ + for(int i = 0; i < nverify; i++) + { + for(int j = 0; j < nresult; j++) + { + if (!ItemPointerEquals(&resultTids[j], &verifyTids[i])) + { + nextFound = true; + break; + } + } + } + + if (!nextFound) + offnum = jumpOffset; + } + + if ((offnum > maxoff) && (so->currPos.nextPage == P_NONE)) + { + _bt_relbuf(indexRel, so->currPos.buf); + BTScanPosInvalidate(so->currPos); + + pfree(so->skipScanKey); + so->skipScanKey = NULL; + return false; + } + } + else + /* + * If startItup could be not found within the current page, + * assume we found something new + */ + nextFound = true; + + /* Return original scankey options */ + so->skipScanKey->keysz = prefix; + so->skipScanKey->nextkey = ScanDirectionIsForward(dir); + } + } + /* Now read the data */ - if (!_bt_readpage(scan, dir, offnum)) + if (!_bt_readpage(scan, indexdir, offnum)) { /* * There's no actually-matching data on this page. Try to advance to diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index ced8933f44..b6245994f0 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -64,7 +64,7 @@ #include "utils/rel.h" #include "utils/syscache.h" -static bool IndexSupportsBackwardScan(Plan *node); +static bool IndexSupportsBackwardScan(Oid indexid); /* @@ -555,8 +555,10 @@ ExecSupportsBackwardScan(Plan *node) return false; case T_IndexScan: + return IndexSupportsBackwardScan(((IndexScan *) node)->indexid); + case T_IndexOnlyScan: - return IndexSupportsBackwardScan(node); + return IndexSupportsBackwardScan(((IndexOnlyScan *) node)->indexid); case T_SubqueryScan: return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan); @@ -596,38 +598,16 @@ ExecSupportsBackwardScan(Plan *node) /* * An IndexScan or IndexOnlyScan node supports backward scan only if the - * index's AM does and no skip scan is used. + * index's AM does. */ static bool -IndexSupportsBackwardScan(Plan *node) +IndexSupportsBackwardScan(Oid indexid) { bool result; - Oid indexid = InvalidOid; - int skip_prefix_size = 0; HeapTuple ht_idxrel; Form_pg_class idxrelrec; IndexAmRoutine *amroutine; - Assert(IsA(node, IndexScan) || IsA(node, IndexOnlyScan)); - switch(nodeTag(node)) - { - case T_IndexScan: - indexid = ((IndexScan *) node)->indexid; - break; - - case T_IndexOnlyScan: - indexid = ((IndexOnlyScan *) node)->indexid; - skip_prefix_size = ((IndexOnlyScan *) node)->indexskipprefixsize; - break; - - default: - elog(DEBUG2, "unrecognized node type: %d", (int) nodeTag(node)); - break; - } - - if (skip_prefix_size > 0) - return false; - /* Fetch the pg_class tuple of the index relation */ ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexid)); if (!HeapTupleIsValid(ht_idxrel)) diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 40ad1b949b..d5abac20cb 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -67,6 +67,7 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid = NULL; + ItemPointerData startTid; IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; /* @@ -75,6 +76,14 @@ IndexOnlyNext(IndexOnlyScanState *node) */ bool skipped = false; + /* + * Index only scan must be aware that in case of skipping we can return to + * the starting point due to visibility checks. In this situation we need + * to jump further, and number of skipping attempts tell us how far do we + * need to do so. + */ + int skipAttempts = 0; + /* * extract necessary information from index scan node */ @@ -123,13 +132,27 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } + else + { + ItemPointerCopy(&scandesc->xs_heaptid, &startTid); + } /* * Check if we need to skip to the next key prefix. + * + * When fetching a cursor in the direction opposite to a general scan + * direction, the result must be what normal fetching should have + * returned, but in reversed order. In other words, return the last or + * first scanned tuple in a DISTINCT set, depending on a cursor direction. + * Due to that we skip also when the first tuple wasn't emitted yet, but + * the directions are opposite. */ - if (node->ioss_SkipPrefixSize > 0 && node->ioss_FirstTupleEmitted) + if (node->ioss_SkipPrefixSize > 0 && + (node->ioss_FirstTupleEmitted || + ScanDirectionsAreOpposite(direction, indexonlyscan->indexorderdir))) { - if (!index_skip(scandesc, direction, node->ioss_SkipPrefixSize)) + if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, node->ioss_SkipPrefixSize)) { /* * Reached end of index. At this point currPos is invalidated, and @@ -144,6 +167,7 @@ IndexOnlyNext(IndexOnlyScanState *node) else { skipped = true; + skipAttempts = 1; tid = &scandesc->xs_heaptid; } } @@ -161,6 +185,35 @@ IndexOnlyNext(IndexOnlyScanState *node) skipped = false; + /* + * If we already emitted first tuple, while doing index only skip scan + * with advancing and reading in different directions we can return to + * the same position where we started after visibility check. Recognize + * such situations and skip more. + */ + if ((readDirection != direction) && node->ioss_FirstTupleEmitted && + ItemPointerIsValid(&startTid) && ItemPointerEquals(&startTid, tid)) + { + int i; + skipAttempts += 1; + + for (i = 0; i < skipAttempts; i++) + { + if (!index_skip(scandesc, direction, + indexonlyscan->indexorderdir, + !node->ioss_FirstTupleEmitted, + node->ioss_SkipPrefixSize)) + { + node->ioss_FirstTupleEmitted = false; + return ExecClearTuple(slot); + } + } + + tid = &scandesc->xs_heaptid; + } + + skipped = false; + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index cb2f48a1bc..58e29d054c 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -176,6 +176,7 @@ typedef bool (*amgettuple_function) (IndexScanDesc scan, /* skip past duplicates in a given prefix */ typedef bool (*amskip_function) (IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, int prefix); /* fetch all valid tuples */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index d13d95c458..5a6d904af6 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -183,7 +183,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *istat); extern bool index_can_return(Relation indexRelation, int attno); -extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix); +extern bool index_skip(IndexScanDesc scan, ScanDirection direction, + ScanDirection indexdir, bool start, int prefix); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 2c516654c2..039e8d1f0d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1232,7 +1232,8 @@ extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix); +extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, int prefix); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot); @@ -1257,7 +1258,8 @@ extern void _bt_end_vacuum_callback(int code, Datum arg); extern Size BTreeShmemSize(void); extern void BTreeShmemInit(void); extern bytea *btoptions(Datum reloptions, bool validate); -extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix); +extern bool btskip(IndexScanDesc scan, ScanDirection dir, + ScanDirection indexdir, int prefix); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); -- 2.32.0