From b4f03568ef99abc02d350c09de527a7a45c954fe Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 6 May 2019 16:23:40 -0700 Subject: [PATCH v1] Don't leave behind junk nbtree pages during split. Commit 8fa30f906be reduced the elevel of a number of "can't happen" _bt_split() errors from PANIC to ERROR. At the same time, the new right page buffer for the split could continue to be acquired well before the critical section. This was possible because it was relatively straightforward to make sure that _bt_split() could not throw an error, with a few specific exceptions. The exceptional cases were safe because they involved specific, well understood errors, making it possible to consistently zero the right page before actually raising an error using elog(). There was no danger of leaving around a junk page, provided _bt_split() stuck to this coding rule. Commit 8224de4f, which introduced INCLUDE indexes, added code to make _bt_split() truncate away non-key attributes. This happened at a point that broke the rule around zeroing the right page in _bt_split(). If truncation failed (perhaps due to palloc() failure), that would result in an errant right page buffer with junk contents. This could confuse VACUUM when it attempted to delete the page, and should be avoided on general principle. To fix, reorganize _bt_split() so that truncation occurs before the new right page buffer is even acquired. It is now safe for _bt_truncate() to throw an error -- a junk page/buffer will not be left behind. Discussion: https://postgr.es/m/CAH2-WzkcWT_-NH7EeL=Az4efg0KCV+wArygW8zKB=+HoP=VWMw@mail.gmail.com --- src/backend/access/nbtree/nbtinsert.c | 156 ++++++++++++++------------ 1 file changed, 87 insertions(+), 69 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index c6c952dd49..7e4e0ab755 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1253,29 +1253,33 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, int indnatts = IndexRelationGetNumberOfAttributes(rel); int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - /* Acquire a new page to split into */ - rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); - /* * origpage is the original page to be split. leftpage is a temporary * buffer that receives the left-sibling data, which will be copied back * into origpage on success. rightpage is the new page that receives the - * right-sibling data. If we fail before reaching the critical section, - * origpage hasn't been modified and leftpage is only workspace. In - * principle we shouldn't need to worry about rightpage either, because it - * hasn't been linked into the btree page structure; but to avoid leaving - * possibly-confusing junk behind, we are careful to rewrite rightpage as - * zeroes before throwing any error. + * right-sibling data, which won't be allocated just yet. It's okay if we + * fail before reaching the point that rightpage is allocated, since + * origpage hasn't been modified and leftpage is only workspace. */ origpage = BufferGetPage(buf); - leftpage = PageGetTempPage(origpage); - rightpage = BufferGetPage(rbuf); - + oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); origpagenumber = BufferGetBlockNumber(buf); - rightpagenumber = BufferGetBlockNumber(rbuf); - + leftpage = PageGetTempPage(origpage); _bt_pageinit(leftpage, BufferGetPageSize(buf)); - /* rightpage was already initialized by _bt_getbuf */ + lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); + + /* + * leftpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + lopaque->btpo_flags = oopaque->btpo_flags; + lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + /* set flag in left page indicating that rightpage has no downlink yet */ + lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; + lopaque->btpo_prev = oopaque->btpo_prev; + /* handle btpo_next after rightpage allocated */ + lopaque->btpo.level = oopaque->btpo.level; + /* Wait until both buffers are locked before initializing btpo_cycleid */ /* * Copy the original page's LSN into leftpage, which will become the @@ -1283,57 +1287,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * examine the LSN and possibly dump it in a page image. */ PageSetLSN(leftpage, PageGetLSN(origpage)); - - /* init btree private data */ - oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); - lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); - isleaf = P_ISLEAF(oopaque); - /* if we're splitting this page, it won't be the root when we're done */ - /* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */ - lopaque->btpo_flags = oopaque->btpo_flags; - lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); - ropaque->btpo_flags = lopaque->btpo_flags; - /* set flag in left page indicating that the right page has no downlink */ - lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; - lopaque->btpo_prev = oopaque->btpo_prev; - lopaque->btpo_next = rightpagenumber; - ropaque->btpo_prev = origpagenumber; - ropaque->btpo_next = oopaque->btpo_next; - lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level; - /* Since we already have write-lock on both pages, ok to read cycleid */ - lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); - ropaque->btpo_cycleid = lopaque->btpo_cycleid; - - /* - * If the page we're splitting is not the rightmost page at its level in - * the tree, then the first entry on the page is the high key for the - * page. We need to copy that to the right half. Otherwise (meaning the - * rightmost page case), all the items on the right half will be user - * data. - */ - rightoff = P_HIKEY; - - if (!P_RIGHTMOST(oopaque)) - { - itemid = PageGetItemId(origpage, P_HIKEY); - itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(origpage, itemid); - Assert(BTreeTupleGetNAtts(item, rel) > 0); - Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); - if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, - false, false) == InvalidOffsetNumber) - { - memset(rightpage, 0, BufferGetPageSize(rbuf)); - elog(ERROR, "failed to add hikey to the right sibling" - " while splitting block %u of index \"%s\"", - origpagenumber, RelationGetRelationName(rel)); - } - rightoff = OffsetNumberNext(rightoff); - } - /* * The "high key" for the new left page will be the first key that's going * to go into the new right page, or possibly a truncated version if this @@ -1360,7 +1315,6 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, * tuple could be physically larger despite being opclass-equal in respect * of all attributes prior to the heap TID attribute.) */ - leftoff = P_HIKEY; if (!newitemonleft && newitemoff == firstright) { /* incoming tuple will become first on right page */ @@ -1416,23 +1370,87 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, else lefthikey = item; + /* + * Add new high key to leftpage + */ + leftoff = P_HIKEY; + Assert(BTreeTupleGetNAtts(lefthikey, rel) > 0); Assert(BTreeTupleGetNAtts(lefthikey, rel) <= indnkeyatts); if (PageAddItem(leftpage, (Item) lefthikey, itemsz, leftoff, false, false) == InvalidOffsetNumber) - { - memset(rightpage, 0, BufferGetPageSize(rbuf)); elog(ERROR, "failed to add hikey to the left sibling" " while splitting block %u of index \"%s\"", origpagenumber, RelationGetRelationName(rel)); - } leftoff = OffsetNumberNext(leftoff); /* be tidy */ if (lefthikey != item) pfree(lefthikey); /* - * Now transfer all the data items to the appropriate page. + * Acquire a new right page to split into, now that left page has a new + * high key. From here on, it's not okay to throw an error without + * zeroing rightpage image first. Zeroing rightpage avoids confusing + * VACUUM, which might otherwise try to re-find a downlink to a leftover + * junk page. + * + * It wouldn't be unreasonable to start the critical section here instead, + * but it's better to delay doing so until the original page must be + * modified. That way, failing to add new items to a page (a known + * symptom of corruption) won't cause us to PANIC. + */ + rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE); + rightpage = BufferGetPage(rbuf); + rightpagenumber = BufferGetBlockNumber(rbuf); + /* rightpage was initialized by _bt_getbuf */ + ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage); + + /* Since we have write-lock on both pages now, ok to read cycleid */ + lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); + ropaque->btpo_cycleid = lopaque->btpo_cycleid; + /* Set leftpage's sibling pointer to point to new rightpage */ + lopaque->btpo_next = rightpagenumber; + + /* + * rightpage won't be the root when we're done. Also, clear the SPLIT_END + * and HAS_GARBAGE flags. + */ + ropaque->btpo_flags = oopaque->btpo_flags; + ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = oopaque->btpo_next; + ropaque->btpo.level = oopaque->btpo.level; + + /* + * Add new high key to rightpage where necessary. + * + * If the page we're splitting is not the rightmost page at its level in + * the tree, then the first entry on the page is the high key for the + * page. We need to copy that to the right half. + */ + rightoff = P_HIKEY; + + if (!P_RIGHTMOST(oopaque)) + { + itemid = PageGetItemId(origpage, P_HIKEY); + itemsz = ItemIdGetLength(itemid); + item = (IndexTuple) PageGetItem(origpage, itemid); + Assert(BTreeTupleGetNAtts(item, rel) > 0); + Assert(BTreeTupleGetNAtts(item, rel) <= indnkeyatts); + if (PageAddItem(rightpage, (Item) item, itemsz, rightoff, + false, false) == InvalidOffsetNumber) + { + memset(rightpage, 0, BufferGetPageSize(rbuf)); + elog(ERROR, "failed to add hikey to the right sibling" + " while splitting block %u of index \"%s\"", + origpagenumber, RelationGetRelationName(rel)); + } + rightoff = OffsetNumberNext(rightoff); + } + + /* + * Now transfer all the data items (non-pivot tuples in isleaf case, or + * additional pivot tuples in !isleaf case) to the appropriate page. * * Note: we *must* insert at least the right page's items in item-number * order, for the benefit of _bt_restore_page(). -- 2.17.1