From 1314d4be1d0d0b5ab472884a01804ae910e3c610 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Sun, 31 Aug 2025 10:00:24 +0500 Subject: [PATCH v2 2/2] btree: Add scan abort mechanism for page merge with tuple movement When B-tree page merge moves tuples between pages, concurrent scans can miss tuples or see duplicates. This adds a BTP_HAD_TUPLES_MOVED flag to mark pages that had tuples moved during merge, and aborts scans that encounter such deleted pages with a serialization failure error. The default mergefactor is set to 0 to disable merging by default, ensuring no behavior change unless explicitly configured. Includes TAP test demonstrating the scan abort mechanism. --- src/backend/access/nbtree/nbtpage.c | 4 ++ src/backend/access/nbtree/nbtsearch.c | 61 ++++++++++++++++++- src/include/access/nbtree.h | 4 +- .../t/008_btree_merge_scan_correctness.pl | 37 ++++++----- 4 files changed, 83 insertions(+), 23 deletions(-) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 792bc52558b..664f5f62b6b 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -2818,6 +2818,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, Page leafpage = BufferGetPage(leafbuf); Page rightpage = BufferGetPage(rbuf); BTPageOpaque rightopaque = BTPageGetOpaque(rightpage); + BTPageOpaque leafopaque = BTPageGetOpaque(leafpage); OffsetNumber insert_at = P_FIRSTDATAKEY(rightopaque); int i; @@ -2838,6 +2839,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, elog(DEBUG1, "Page merge completed in index \"%s\": moved %d tuples from page %u to page %u", RelationGetRelationName(rel), merge_ntuples, leafblkno, rightsib); + + /* Mark that this page had tuples moved for scan detection */ + leafopaque->btpo_flags |= BTP_HAD_TUPLES_MOVED; } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 6ff31f87033..28ce65faa71 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -2223,6 +2223,8 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) Assert(!scan->parallel_scan); } + + BTScanPosUnpinIfPinned(so->currPos); /* @@ -2233,11 +2235,11 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) */ { Relation rel = scan->indexRelation; - BlockNumber blkno = so->currPos.currPage; + BlockNumber ip_blkno = so->currPos.currPage; /* Only pause scans on our test index (btree_test_idx has OID around 16400+) */ - if (rel && RelationGetRelid(rel) > 16384 && blkno == 20) + if (rel && RelationGetRelid(rel) > 16384 && ip_blkno == 20) { - INJECTION_POINT("btree-scan-between-pages", &blkno); + INJECTION_POINT("btree-scan-between-pages", &ip_blkno); } } @@ -2446,6 +2448,19 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); lastcurrblkno = blkno; + + /* + * Check if this is a deleted page that had tuples moved during merge. + * If so, abort the scan to prevent incorrect results. + */ + if (P_ISDELETED(opaque) && P_HAD_TUPLES_MOVED(opaque)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("scan aborted due to concurrent page merge with tuple movement"), + errhint("Retry the operation."))); + } + if (likely(!P_IGNORE(opaque))) { /* see if there are any matches on this page */ @@ -2549,6 +2564,20 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, /* Found desired page, return it */ return buf; } + + /* + * Check if this is a deleted page that had tuples moved during merge. + * If so, abort the scan to prevent incorrect results. + */ + if (P_ISDELETED(opaque) && P_HAD_TUPLES_MOVED(opaque)) + { + _bt_relbuf(rel, buf); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("scan aborted due to concurrent page merge with tuple movement"), + errhint("Retry the operation."))); + } + if (P_RIGHTMOST(opaque) || ++tries > 4) break; /* step right */ @@ -2568,6 +2597,19 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, opaque = BTPageGetOpaque(page); if (P_ISDELETED(opaque)) { + /* + * Check if this is a deleted page that had tuples moved during merge. + * If so, abort the scan to prevent incorrect results. + */ + if (P_HAD_TUPLES_MOVED(opaque)) + { + _bt_relbuf(rel, buf); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("scan aborted due to concurrent page merge with tuple movement"), + errhint("Retry the operation."))); + } + /* * It was deleted. Move right to first nondeleted page (there * must be one); that is the page that has acquired the deleted @@ -2585,6 +2627,19 @@ _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, opaque = BTPageGetOpaque(page); if (!P_ISDELETED(opaque)) break; + + /* + * Check if this is a deleted page that had tuples moved during merge. + * If so, abort the scan to prevent incorrect results. + */ + if (P_HAD_TUPLES_MOVED(opaque)) + { + _bt_relbuf(rel, buf); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("scan aborted due to concurrent page merge with tuple movement"), + errhint("Retry the operation."))); + } } } else diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6fd1bcd142d..041bf0596ae 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -83,6 +83,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ #define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ +#define BTP_HAD_TUPLES_MOVED (1 << 9) /* page was deleted after moving tuples */ /* * The max allowed value of a cycle ID is a bit less than 64K. This is @@ -201,7 +202,7 @@ typedef struct BTMetaPageData #define BTREE_DEFAULT_FILLFACTOR 90 #define BTREE_NONLEAF_FILLFACTOR 70 #define BTREE_SINGLEVAL_FILLFACTOR 96 -#define BTREE_DEFAULT_MERGEFACTOR 5 +#define BTREE_DEFAULT_MERGEFACTOR 0 /* Disabled by default for safety */ /* * In general, the btree code tries to localize its knowledge about @@ -228,6 +229,7 @@ typedef struct BTMetaPageData #define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) #define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) +#define P_HAD_TUPLES_MOVED(opaque) (((opaque)->btpo_flags & BTP_HAD_TUPLES_MOVED) != 0) /* * BTDeletedPageData is the page contents of a deleted page diff --git a/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl index abb40f6a4ff..ca06cd2cf28 100644 --- a/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl +++ b/src/test/modules/test_misc/t/008_btree_merge_scan_correctness.pl @@ -76,7 +76,7 @@ $node->safe_psql('postgres', q{ my $forward_scan = $node->background_psql('postgres'); my $backward_scan = $node->background_psql('postgres'); -# Start queries without waiting for completion (they'll hang at injection point) +# Start queries without waiting for completion (they should abort with serialization error) $forward_scan->query_until(qr/starting forward scan/, q{ SET enable_seqscan = off; SET enable_bitmapscan = off; @@ -100,9 +100,13 @@ sleep(1); # Run VACUUM while scans are paused - this may trigger page merge $node->safe_psql('postgres', q{ + SET client_min_messages TO DEBUG1; VACUUM btree_test; }); +# Get current log position to check for new errors +my $log_offset = -s $node->logfile; + # Release waiting scans $node->safe_psql('postgres', q{ SELECT injection_points_detach('btree-scan-between-pages'); @@ -110,28 +114,23 @@ $node->safe_psql('postgres', q{ SELECT injection_points_wakeup('btree-scan-between-pages'); }); -$forward_scan->quit; -$backward_scan->quit; - -# Analyze results for scan correctness issues -my $expected_count = $node->safe_psql('postgres', - 'SELECT count(*) FROM btree_test'); +# Wait for scans to abort with serialization errors +$node->wait_for_log('scan aborted due to concurrent page merge with tuple movement', + $log_offset); -my $forward_count = $node->safe_psql('postgres', - 'SELECT count(*) FROM forward_results'); +# Clean up background processes - they should have failed +$forward_scan->{run}->finish; +$backward_scan->{run}->finish; -my $backward_count = $node->safe_psql('postgres', - 'SELECT count(*) FROM backward_results'); +$node->stop('fast'); -# Report results -note("Expected rows: $expected_count"); -note("Forward scan rows: $forward_count"); -note("Backward scan rows: $backward_count"); +# Verify that scans were aborted by checking the log file +my $log_contents = slurp_file($node->logfile); +my $error_count = () = $log_contents =~ /scan aborted due to concurrent page merge with tuple movement/g; -# Test assertions - these should fail with page merge, showing the race condition -is($forward_count, $expected_count, 'Forward scan returns correct count'); -is($backward_count, $expected_count, 'Backward scan returns correct count'); +note("Found $error_count scan abort errors in log"); -$node->stop('fast'); +# We should see at least two scan abort error (possibly two, one for each scan) +ok($error_count >= 2, 'At least twp scan was aborted due to tuple movement'); done_testing(); \ No newline at end of file -- 2.39.5 (Apple Git-154)