From 6bf12b80259c48ee613a56e85f37ebb73cfdf819 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 30 Apr 2019 10:55:52 -0700 Subject: [PATCH v2] Fix nbtsort.c's page space accounting. Commit dd299df8189 failed to have nbtsort.c conservatively assume that suffix truncation was ineffective (i.e. that it would represent heap TID in new high key). Space for a possible heap TID in new leaf page high key was budgeted within _bt_check_third_page(), but nbtsort.c's definition of when a page is completely full didn't explicitly consider high keys where heap TID is represented. When the page was deemed full, it might already be too late: there might be insufficient space, because the last existing non-pivot tuple on page gets replaced with new high key that is slightly larger (larger by the space required to store a MAXALIGN()'d heap TID item pointer). To fix, bring nbtsort.c in line with nbtsplitloc.c, which already explicitly assumes that new high key will need to have a heap TID added when high key is formed on the leaf level. Reported-By: Andreas Joseph Krogh Discussion: https://postgr.es/m/VisenaEmail.c5.3ee7fe277d514162.16a6d785bea@tc7-visena --- src/backend/access/nbtree/nbtsort.c | 44 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 9ac4c1e1c0..7db468bb58 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -841,6 +841,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) OffsetNumber last_off; Size pgspc; Size itupsz; + bool isleaf; /* * This is a handy place to check for cancel interrupts during the btree @@ -855,9 +856,13 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleSize(itup); itupsz = MAXALIGN(itupsz); + /* Leaf pages use suffix truncation, and need extra heap TID space */ + isleaf = state->btps_level == 0; /* - * Check whether the item can fit on a btree page at all. + * Check whether the item can fit on page, while making sure that page has + * at least two tuples (in addition to page high key) before starting next + * page. * * Every newly built index will treat heap TID as part of the keyspace, * which imposes the requirement that new high keys must occasionally have @@ -870,16 +875,27 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * the reserved space. This should never fail on internal pages. */ if (unlikely(itupsz > BTMaxItemSize(npage))) - _bt_check_third_page(wstate->index, wstate->heap, - state->btps_level == 0, npage, itup); + _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, + itup); /* - * Check to see if page is "full". It's definitely full if the item won't - * fit. Otherwise, compare to the target freespace derived from the - * fillfactor. However, we must put at least two items on each page, so - * disregard fillfactor if we don't have that many. + * Page is definitely full if the new item won't fit. We take into + * account the possible need for heap TID space within _bt_truncate() when + * page is a leaf page. It is guaranteed that we can fit at least 2 + * non-pivot tuples plus a high key with heap TID when finishing off a + * leaf page, because _bt_check_third_page() conservatively rejects + * oversized non-pivot tuples. (On internal pages we can always fit 3 + * pivot tuples, including high key.) + * + * Most of the time, page is only "full" in the sense that inserting new + * tuple would cause us to exceed fillfactor-wise limit (no need to take + * heap TID space into account in this soft limit). However, we must + * always leave at least two items plus high key on each page before + * starting a new page, so disregard fillfactor if we don't have enough + * items to make that work yet. */ - if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY)) + if (pgspc - (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) < itupsz || + (pgspc < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. @@ -889,7 +905,6 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) ItemId ii; ItemId hii; IndexTuple oitup; - BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage); /* Create new page of same level */ npage = _bt_blnewpage(state->btps_level); @@ -917,7 +932,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); - if (P_ISLEAF(opageop)) + if (isleaf) { IndexTuple lastleft; IndexTuple truncated; @@ -944,8 +959,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) * to actually save space on the leaf page). We delete the * original high key, and add our own truncated high key at the * same offset. It's okay if the truncated tuple is slightly - * larger due to containing a heap TID value, since this case is - * known to _bt_check_third_page(), which reserves space. + * larger due to containing a heap TID value, since that was taken + * into account when we determined that page is full; the extra + * space must already be available on page. * * Note that the page layout won't be changed very much. oitup is * already located at the physical beginning of tuple space, so we @@ -979,9 +995,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) Assert((BTreeTupleGetNAtts(state->btps_minkey, wstate->index) <= IndexRelationGetNumberOfKeyAttributes(wstate->index) && BTreeTupleGetNAtts(state->btps_minkey, wstate->index) > 0) || - P_LEFTMOST(opageop)); + P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 || - !P_LEFTMOST(opageop)); + !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage))); BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno); _bt_buildadd(wstate, state->btps_next, state->btps_minkey); pfree(state->btps_minkey); -- 2.17.1