diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 3680e69b89a..eb4df2ebbe6 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -609,6 +609,9 @@ original search scankey is consulted as each index entry is
 sequentially scanned to decide whether to return the entry and whether the
 scan can stop (see _bt_checkkeys()).
 
+HEIKKI: The above probably needs some updating, now that we have a
+separate BTScanInsert struct to represent an insertion scan key.
+
 We use term "pivot" index tuples to distinguish tuples which don't point
 to heap tuples, but rather used for tree navigation. Pivot tuples includes
 all tuples on non-leaf pages and high keys on leaf pages. Note that pivot
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index b3fbba276dd..2a2d6576060 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -97,9 +97,12 @@ static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
  * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
  * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
  * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
- * don't actually insert. If rel is a unique index, then every call
- * here is a checkingunique call (i.e. every call does a duplicate
- * check, though perhaps only a tentative check).
+ * don't actually insert.
+
+HEIKKI: 'checkingunique' is a local variable in the function. Seems a bit
+weird to talk about it in the function comment. I didn't understand what
+the point of adding this sentence was, so I removed it.
+
  *
  * The result value is only significant for UNIQUE_CHECK_PARTIAL:
  * it must be true if the entry is known unique, else false.
@@ -285,9 +288,10 @@ top:
 			CheckForSerializableConflictIn(rel, NULL, buf);
 
 		/*
-		 * Do the insertion. Note that itup_key contains mutable state used
-		 * by _bt_check_unique to help _bt_findinsertloc avoid repeating its
-		 * binary search. !checkingunique case must start own binary search.
+		 * Do the insertion. Note that itup_key contains state filled in by
+		 * _bt_check_unique to help _bt_findinsertloc avoid repeating its
+		 * binary search. !checkingunique case must start its own binary
+		 * search.
 		 */
 		newitemoff = _bt_findinsertloc(rel, itup_key, &buf, checkingunique,
 									   itup, stack, heapRel);
@@ -311,10 +315,6 @@ top:
 /*
  * _bt_check_unique() -- Check for violation of unique index constraint
  *
- * Sets state in itup_key sufficient for later _bt_findinsertloc() call to
- * reuse most of the work of our initial binary search to find conflicting
- * tuples.
- *
  * Returns InvalidTransactionId if there is no conflict, else an xact ID
  * we must wait for to see if it commits a conflicting tuple. If an actual
  * conflict is detected, no return --- just ereport(). If an xact ID is
@@ -326,6 +326,10 @@ top:
  * InvalidTransactionId because we don't want to wait. In this case we
  * set *is_unique to false if there is a potential conflict, and the
  * core code must redo the uniqueness check later.
+ *
+ * As a side-effect, sets state in itup_key that can later be used by
+ * _bt_findinsertloc() to reuse most of the binary search work we do
+ * here.
  */
 static TransactionId
 _bt_check_unique(Relation rel, BTScanInsert itup_key,
@@ -352,8 +356,8 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 	maxoff = PageGetMaxOffsetNumber(page);
 
 	/*
-	 * Save binary search bounds. Note that this is also used within
-	 * _bt_findinsertloc() later.
+	 * Save binary search bounds. We use them in the fastpath below, but
+	 * also in the _bt_findinsertloc() call later.
 	 */
 	itup_key->savebinsrch = true;
 	offset = _bt_binsrch(rel, itup_key, buf);
@@ -375,16 +379,16 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 	if (offset <= maxoff)
 	{
 		/*
-		 * Fastpath: _bt_binsrch() search bounds can be used to limit our
-		 * consideration to items that are definitely duplicates in most
-		 * cases (though not when original page is empty, or when initial
-		 * offset is past the end of the original page, which may indicate
-		 * that we'll have to examine a second or subsequent page).
+		 * Fastpath: In most cases, we can use the _bt_binsrch search
+		 * bounds to limit our consideration to items that are definitely
+		 * duplicates. This fastpath doesn't apply when the original page
+		 * is empty, or when the initial offset is past the end of the
+		 * original page, which may indicate that we need to examine a
+		 * second or subsequent page.
 		 *
 		 * Note that this optimization avoids calling _bt_isequal()
-		 * entirely when there are no duplicates, provided initial offset
-		 * isn't past end of the initial page (and provided page has at
-		 * least one item).
+		 * entirely when there are no duplicates, as long as the location
+		 * where the key would belong is not at the end of the page.
 		 */
 		if (nbuf == InvalidBuffer && offset == itup_key->stricthigh)
 		{
@@ -588,6 +592,17 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
 			if (P_RIGHTMOST(opaque))
 				break;
 			highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
+
+			/*
+			 * HEIKKI: This assertion might fire if the user-defined
+			 * opclass is broken. It's just an assertion, so maybe that's
+			 * ok. With a broken opclass, it's obviously "garbage in,
+			 * garbage out", but we should try to behave sanely anyway. I
+			 * don't remember what our general policy on that is; should
+			 * we assert, elog(ERROR), or continue silently in that case?
+			 * An elog(ERROR) or elog(WARNING) would feel best to me.
+			 */
 			Assert(highkeycmp <= 0);
 			if (highkeycmp != 0)
 				break;
@@ -644,30 +659,55 @@ _bt_check_unique(Relation rel, BTScanInsert itup_key,
  * Once we have chosen the page to put the key on, we'll insert it before
  * any existing equal keys because of the way _bt_binsrch() works.
  *
- * _bt_check_unique() callers arrange for their insertion scan key to
- * save the progress of the last binary search performed. No additional
- * binary search comparisons occur in the common case where there was no
- * existing duplicate tuple, though we may occasionally still not be able
- * to reuse their work for our own reasons. Even when there are garbage
- * duplicates, very few binary search comparisons will be performed
- * without being strictly necessary.
+ * _bt_check_unique() saves the progress of the binary search it performs
+ * in the insertion scan key. In the common case that there were no
+ * duplicates, we don't need to do any additional binary search
+ * comparisons here, though occasionally we may still not be able to
+ * reuse the saved state for our own reasons. Even when there are garbage
+ * duplicates, we do very few binary search comparisons that are not
+ * strictly necessary.
  *
- * The caller should hold an exclusive lock on *bufptr in all cases. On
- * exit, bufptr points to the chosen insert location in all cases. If
- * we have to move right, the lock and pin on the original page will be
- * released, and the new page returned to the caller is exclusively
- * locked instead. In any case, we return the offset that caller should
- * use to insert into the buffer pointed to by bufptr on return.
+
+HEIKKI:
+
+Should we mention explicitly that this binary-search reuse is only applicable
+if unique checks were performed? It's kind of implied by the fact that it's
+_bt_check_unique() that saves the state, but perhaps we should be more clear
+about it.
+
+What is a "garbage duplicate"? Same as a "dead duplicate"?
+
+The last sentence, about garbage duplicates, seems really vague. Why do we
+ever do any comparisons that are not strictly necessary? Perhaps it's best to
+just remove that last sentence.
+
+ *
+ * On entry, *bufptr points to the first legal page where the new tuple
+ * could be inserted. The caller must hold an exclusive lock on *bufptr.
+ *
+ * On exit, *bufptr points to the chosen insertion page, and the offset
+ * within that page is returned. If _bt_findinsertloc decides to move
+ * right, the lock and pin on the original page are released, and the new
+ * page returned to the caller is exclusively locked instead.
  *
  * This is also where opportunistic microvacuuming of LP_DEAD tuples
  * occurs. It has to happen here, since it may invalidate a
  * _bt_check_unique() caller's cached binary search work.
+
+HEIKKI: I don't buy the argument that microvacuuming has to happen here. You
+could easily imagine a separate function that does microvacuuming, and resets
+(or even updates) the binary-search cache in the insertion key. I agree this
+is a convenient place to do it, though.
+
  */
 static OffsetNumber
 _bt_findinsertloc(Relation rel,
 				  BTScanInsert itup_key,
 				  Buffer *bufptr,
 				  bool checkingunique,
+/* HEIKKI:
+Do we need 'checkingunique' as an argument? If unique checks were not
+performed, the insertion key will simply not have saved state.
+*/
 				  IndexTuple newtup,
 				  BTStack stack,
 				  Relation heapRel)
@@ -706,6 +746,30 @@ _bt_findinsertloc(Relation rel,
 						 errtableconstraint(heapRel,
 											RelationGetRelationName(rel))));
 
+	/* HEIKKI: I liked this comment that we used to have here, before this patch: */
+	/*----------
+	 * If we will need to split the page to put the item on this page,
+	 * check whether we can put the tuple somewhere to the right,
+	 * instead. Keep scanning right until we
+	 *		(a) find a page with enough free space,
+	 *		(b) reach the last page where the tuple can legally go, or
+	 *		(c) get tired of searching.
+	 * (c) is not flippant; it is important because if there are many
+	 * pages' worth of equal keys, it's better to split one of the early
+	 * pages than to scan all the way to the end of the run of equal keys
+	 * on every insert. We implement "get tired" as a random choice,
+	 * since stopping after scanning a fixed number of pages wouldn't work
+	 * well (we'd never reach the right-hand side of previously split
+	 * pages). Currently the probability of moving right is set at 0.99,
+	 * which may seem too high to change the behavior much, but it does an
+	 * excellent job of preventing O(N^2) behavior with many equal keys.
+	 *----------
+	 */
+	/* HEIKKI: Maybe it's not relevant with the later patches, but at least
+	 * with just this first patch, it's still valid. I noticed that the
+	 * comment is now in _bt_useduplicatepage; it seems a bit out of place
+	 * there.
+	 */
+
 	Assert(P_ISLEAF(lpageop) && !P_INCOMPLETE_SPLIT(lpageop));
 	for (;;)
 	{
@@ -714,29 +778,35 @@ _bt_findinsertloc(Relation rel,
 		BlockNumber rblkno;
 
 		/*
-		 * The checkingunique (restorebinsrch) case may well have established
-		 * bounds within _bt_check_unique()'s binary search that preclude the
-		 * need for a further high key check. This fastpath isn't used when
-		 * there are no items on the existing page (other than high key), or
-		 * when it looks like the new item belongs last on the page, but it
-		 * might go on a later page instead.
+		 * An earlier _bt_check_unique() call may well have saved bounds that
+		 * we can use to skip the high key check. This fastpath cannot be
+		 * used when there are no items on the existing page (other than the
+		 * high key), or when it looks like the new item belongs last on the
+		 * page, but it might go on a later page instead.
 		 */
 		if (restorebinsrch &&
 			itup_key->low <= itup_key->stricthigh &&
 			itup_key->stricthigh <= PageGetMaxOffsetNumber(page))
 			break;
 
+		/*
+		 * If this is the last page that the tuple can legally go to, stop
+		 * here.
+		 */
 		if (P_RIGHTMOST(lpageop))
 			break;
 
 		cmpval = _bt_compare(rel, itup_key, page, P_HIKEY);
+		if (cmpval != 0)
+			break;
 
 		/*
-		 * May have to handle case where there is a choice of which page to
-		 * place new tuple on, and we must balance space utilization as best
-		 * we can.
+		 * Otherwise, we have a choice: insert here, or move right to a
+		 * later page. Try to balance space utilization as best we can.
 		 */
-		if (cmpval != 0 || _bt_useduplicatepage(rel, heapRel, buf,
-												&restorebinsrch, itemsz))
+		if (_bt_useduplicatepage(rel, heapRel, buf, &restorebinsrch, itemsz))
+		{
+			/* decided to insert here */
 			break;
+		}
 
 		/*
 		 * step right to next non-dead page
@@ -786,9 +856,17 @@ _bt_findinsertloc(Relation rel,
 		   _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
 
 	/*
-	 * Perform microvacuuming of the page we're about to insert tuple on if it
-	 * looks like it has LP_DEAD items. Only microvacuum when it's likely to
-	 * forestall a page split, though.
+	 * If the page we're about to insert into doesn't have enough room for
+	 * the new tuple, we will have to split it. If it looks like the page
+	 * has LP_DEAD items, try to remove them, in the hope of making room
+	 * for the new item and avoiding the split.
+
+HEIKKI: In some scenarios, if the BTP_HAS_GARBAGE flag is falsely set, we would
+try to microvacuum the page twice: first in _bt_useduplicatepage, and a second
+time here. That's because _bt_vacuum_one_page() doesn't clear the flag if
+there are in fact no LP_DEAD items. That's probably insignificant and not worth
+worrying about, but I thought I'd mention it.
+
 	 */
 	if (P_HAS_GARBAGE(lpageop) && PageGetFreeSpace(page) < itemsz)
 	{
@@ -814,15 +892,15 @@ _bt_findinsertloc(Relation rel,
 /*
  * _bt_useduplicatepage() -- Settle for this page of duplicates?
  *
- * This function handles the question of whether or not an insertion
- * of a duplicate into a pg_upgrade'd !heapkeyspace index should
- * insert on the page contained in buf when a choice must be made.
- * Preemptive microvacuuming is performed here when that could allow
- * caller to insert on to the page in buf.
+ * If we have the choice to insert onto the current page, or onto some
+ * later page to the right, this function decides what to do.
  *
- * Returns true if caller should proceed with insert on buf's page.
- * Otherwise, caller should move on to the page to the right (caller
- * must always be able to still move right following call here).
+ * If the current page doesn't have enough free space for the new
+ * tuple, we "microvacuum" the page, removing LP_DEAD items, in the
+ * hope that it will make enough room.
+ *
+ * Returns true if caller should proceed with insert on the current
+ * page. Otherwise, caller should move on to the page to the right.
  */
 static bool
 _bt_useduplicatepage(Relation rel, Relation heapRel, Buffer buf,
@@ -838,6 +916,10 @@ _bt_useduplicatepage(Relation rel, Relation heapRel, Buffer buf,
 	if (PageGetFreeSpace(page) >= itemsz)
 		return true;
 
+	/*
+	 * Before considering moving right, see if we can obtain enough space by
+	 * removing LP_DEAD items.
+	 */
 	if (P_HAS_GARBAGE(lpageop))
 	{
 		_bt_vacuum_one_page(rel, buf, heapRel);
@@ -1275,9 +1357,11 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 	 * If the page we're splitting is not the rightmost page at its level in
 	 * the tree, then the first entry on the page is the high key for the
 	 * page. We need to copy that to the right half. Otherwise (meaning the
-	 * rightmost page case), all the items on the right half will be user data
-	 * (there is no existing high key that needs to be relocated to the new
-	 * right page).
+	 * rightmost page case), all the items on the right half will be user
+	 * data.
+	 *
+HEIKKI: I don't think the comment change you made here was needed or
+helpful, so I reverted it.
 	 */
 	rightoff = P_HIKEY;
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 7940297305d..f27148eb27d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -162,8 +162,8 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
 		new_stack->bts_parent = stack_in;
 
 		/*
-		 * Page level 1 is lowest non-leaf page level prior to leaves. So,
-		 * if we're on the level 1 and asked to lock leaf page in write mode,
+		 * Page level 1 is lowest non-leaf page level prior to leaves. So, if
+		 * we're on the level 1 and asked to lock leaf page in write mode,
 		 * then lock next page in write mode, because it must be a leaf.
 		 */
 		if (opaque->btpo.level == 1 && access == BT_WRITE)
@@ -333,13 +333,14 @@ _bt_moveright(Relation rel,
  *
  * This procedure is not responsible for walking right, it just examines
  * the given page. _bt_binsrch() has no lock or refcount side effects
- * on the buffer. When key.savebinsrch is set, modifies mutable fields
- * of insertion scan key, so that a subsequent call where caller sets
- * key.savebinsrch can reuse the low and strict high bound of original
- * binary search. Callers that use these fields directly must be
- * prepared for the case where stricthigh isn't on the same page (it
- * exceeds maxoff for the page), and the case where there are no items
- * on the page (high < low).
+ * on the buffer.
+ *
+ * When key.savebinsrch is set, modifies mutable fields of the insertion
+ * scan key, so that a subsequent call where the caller sets
+ * key.restorebinsrch can reuse the low and strict high bounds of the
+ * original binary search. Callers that use these fields directly must
+ * be prepared for the case where stricthigh isn't on the same page (it
+ * exceeds maxoff for the page), and the case where there are no items
+ * on the page (high < low).
  */
 OffsetNumber
 _bt_binsrch(Relation rel,
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index e010bcdcfa9..3daf5829f82 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -103,9 +103,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
 		procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
 
 		/*
-		 * Key arguments built when caller provides no tuple are defensively
-		 * represented as NULL values, though they should still not
-		 * participate in comparisons.
+		 * If the caller provides no tuple, the key arguments should never
+		 * be used. Set them to NULL anyway, to be defensive.
 		 */
 		if (i < tupnatts)
 			arg = index_getattr(itup, i + 1, itupdesc, &null);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index dc2eafb5665..45899454bba 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -323,8 +323,7 @@ typedef BTStackData *BTStack;
  * BTScanInsert is the btree-private state needed to find an initial position
  * for an indexscan, or to insert new tuples -- an "insertion scankey" (not to
  * be confused with a search scankey). It's used to descend a B-Tree using
- * _bt_search. For details on its mutable state, see _bt_binsrch and
- * _bt_findinsertloc.
+ * _bt_search.
  *
  * When nextkey is false (the usual case), _bt_search and _bt_binsrch will
  * locate the first item >= scankey. When nextkey is true, they will locate
@@ -334,9 +333,14 @@ typedef BTStackData *BTStack;
  *
  * scankeys is an array of scan key entries for attributes that are compared.
  * During insertion, there must be a scan key for every attribute, but when
- * starting a regular index scan some can be omitted. The array is used as a
+ * starting a regular index scan, some can be omitted. The array is used as a
  * flexible array member, though it's sized in a way that makes it possible to
  * use stack allocations. See nbtree/README for full details.
+
+HEIKKI: I don't see anything in the README about stack allocations. What
+exactly does the README reference refer to? No code seems to actually allocate
+this on the stack, so we don't really need that sentence.
+
  */
 
 typedef struct BTScanInsertData
@@ -344,7 +348,8 @@ typedef struct BTScanInsertData
 {
 	/*
 	 * Mutable state used by _bt_binsrch to inexpensively repeat a binary
 	 * search on the leaf level. Only used for insertions where
-	 * _bt_check_unique is called.
+	 * _bt_check_unique is called. See _bt_binsrch and _bt_findinsertloc
+	 * for details.
 	 */
 	bool		savebinsrch;
 	bool		restorebinsrch;
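
To make the bounds-caching discussion above a bit more concrete, here is a
minimal, self-contained sketch of the idea. It is not the patch's actual
code: it searches a plain sorted int array instead of index pages, and
SearchKey and binsrch() are hypothetical stand-ins for BTScanInsert and
_bt_binsrch().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the mutable state in BTScanInsertData. */
typedef struct SearchKey
{
	int			value;			/* value being searched for */
	bool		savebounds;		/* save the final bounds of this search? */
	bool		restorebounds;	/* start from previously saved bounds? */
	int			low;			/* saved lower bound (inclusive) */
	int			stricthigh;		/* saved strict upper bound */
} SearchKey;

/*
 * Return the first index in items[0..n-1] whose value is >= key->value.
 * With savebounds set, remember bounds bracketing all equal values; with
 * restorebounds set, start from those bounds instead of from scratch.
 */
static int
binsrch(const int *items, int n, SearchKey *key)
{
	int			low = 0;
	int			high = n;		/* strict upper bound on the answer */
	int			stricthigh = n; /* values from here on are > key->value */

	if (key->restorebounds)
	{
		low = key->low;
		high = stricthigh = key->stricthigh;
		key->restorebounds = false;
	}

	while (low < high)
	{
		int			mid = low + (high - low) / 2;

		if (items[mid] < key->value)
			low = mid + 1;
		else
		{
			high = mid;
			if (items[mid] > key->value)
				stricthigh = high;
		}
	}

	if (key->savebounds)
	{
		/* any values equal to key->value lie in [low, stricthigh) */
		key->low = low;
		key->stricthigh = stricthigh;
		key->savebounds = false;
	}

	return low;
}

int
main(void)
{
	int			items[] = {10, 20, 20, 20, 30, 40};
	SearchKey	key = {20, true, false, 0, 0};

	/* first search saves its bounds, like _bt_check_unique() does */
	printf("first search: %d\n", binsrch(items, 6, &key));

	/* second search reuses them, like _bt_findinsertloc() does */
	key.restorebounds = true;
	printf("second search: %d\n", binsrch(items, 6, &key));
	return 0;
}

The point is just that the second search starts from the bounds saved by the
first one, which is what lets _bt_findinsertloc() skip most of the comparisons
that _bt_check_unique() has already performed.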