From 60aa76aaea5e21a6821aa2be44f11bf35f7ef94b Mon Sep 17 00:00:00 2001 From: "Andrey M. Borodin" Date: Sat, 23 Jul 2022 14:22:26 +0500 Subject: [PATCH v28-review 07/13] Add gin_index_parent_check() to verify GIN index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Grigory Kryachko Author: Heikki Linnakangas Author: Andrey Borodin Reviewed-By: José Villanova Reviewed-By: Aleksander Alekseev Reviewed-By: Nikolay Samokhvalov Reviewed-By: Andres Freund Discussion: https://postgr.es/m/45AC9B0A-2B45-40EE-B08F-BDCF5739D1E1%40yandex-team.ru --- contrib/amcheck/Makefile | 3 +- contrib/amcheck/amcheck--1.4--1.5.sql | 9 + contrib/amcheck/expected/check_gin.out | 64 ++ contrib/amcheck/meson.build | 2 + contrib/amcheck/sql/check_gin.sql | 40 ++ contrib/amcheck/verify_gin.c | 769 +++++++++++++++++++++++++ doc/src/sgml/amcheck.sgml | 19 + 7 files changed, 905 insertions(+), 1 deletion(-) create mode 100644 contrib/amcheck/expected/check_gin.out create mode 100644 contrib/amcheck/sql/check_gin.sql create mode 100644 contrib/amcheck/verify_gin.c diff --git a/contrib/amcheck/Makefile b/contrib/amcheck/Makefile index f63252ff33c..5c3ea8bc6a4 100644 --- a/contrib/amcheck/Makefile +++ b/contrib/amcheck/Makefile @@ -4,6 +4,7 @@ MODULE_big = amcheck OBJS = \ $(WIN32RES) \ amcheck.o \ + verify_gin.o \ verify_gist.o \ verify_heapam.o \ verify_nbtree.o @@ -13,7 +14,7 @@ DATA = amcheck--1.2--1.3.sql amcheck--1.1--1.2.sql amcheck--1.0--1.1.sql amcheck amcheck--1.3--1.4.sql amcheck--1.4--1.5.sql PGFILEDESC = "amcheck - function for verifying relation integrity" -REGRESS = check check_btree check_gist check_heap +REGRESS = check check_btree check_gin check_gist check_heap EXTRA_INSTALL = contrib/pg_walinspect TAP_TESTS = 1 diff --git a/contrib/amcheck/amcheck--1.4--1.5.sql b/contrib/amcheck/amcheck--1.4--1.5.sql index 3fc72364180..a2bca7c2037 100644 --- a/contrib/amcheck/amcheck--1.4--1.5.sql +++ b/contrib/amcheck/amcheck--1.4--1.5.sql @@ -12,3 +12,12 @@ AS 'MODULE_PATHNAME', 'gist_index_check' LANGUAGE C STRICT; REVOKE ALL ON FUNCTION gist_index_check(regclass, boolean) FROM PUBLIC; + +-- gin_index_parent_check() +-- +CREATE FUNCTION gin_index_parent_check(index regclass) +RETURNS VOID +AS 'MODULE_PATHNAME', 'gin_index_parent_check' +LANGUAGE C STRICT; + +REVOKE ALL ON FUNCTION gin_index_parent_check(regclass) FROM PUBLIC; diff --git a/contrib/amcheck/expected/check_gin.out b/contrib/amcheck/expected/check_gin.out new file mode 100644 index 00000000000..43fd769a506 --- /dev/null +++ b/contrib/amcheck/expected/check_gin.out @@ -0,0 +1,64 @@ +-- Test of index bulk load +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check"("Column1" int[]); +-- posting trees (frequently used entries) +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves (sparse entries) +INSERT INTO gin_check select array_agg(255 + round(random()*100)) from generate_series(1, 100) as i group by i % 100; +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_idx'); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check; +-- Test index inserts +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check"("Column1" int[]); +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +ALTER INDEX gin_check_idx SET (fastupdate = false); +-- posting trees +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check select array_agg(100 + round(random()*255)) from generate_series(1, 100) as i group by i % 100; +SELECT gin_index_parent_check('gin_check_idx'); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check; +-- Test GIN over text array +SELECT setseed(1); + setseed +--------- + +(1 row) + +CREATE TABLE "gin_check_text_array"("Column1" text[]); +-- posting trees +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300)::text)::text) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300 + 300)::text)::text) from generate_series(1, 10000) as i group by i % 100; +CREATE INDEX gin_check_text_array_idx on "gin_check_text_array" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_text_array_idx'); + gin_index_parent_check +------------------------ + +(1 row) + +-- cleanup +DROP TABLE gin_check_text_array; diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build index 15ae94cc90f..5c9ddfe0758 100644 --- a/contrib/amcheck/meson.build +++ b/contrib/amcheck/meson.build @@ -2,6 +2,7 @@ amcheck_sources = files( 'amcheck.c', + 'verify_gin.c', 'verify_gist.c', 'verify_heapam.c', 'verify_nbtree.c', @@ -38,6 +39,7 @@ tests += { 'sql': [ 'check', 'check_btree', + 'check_gin', 'check_gist', 'check_heap', ], diff --git a/contrib/amcheck/sql/check_gin.sql b/contrib/amcheck/sql/check_gin.sql new file mode 100644 index 00000000000..9771afffa5c --- /dev/null +++ b/contrib/amcheck/sql/check_gin.sql @@ -0,0 +1,40 @@ +-- Test of index bulk load +SELECT setseed(1); +CREATE TABLE "gin_check"("Column1" int[]); +-- posting trees (frequently used entries) +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves (sparse entries) +INSERT INTO gin_check select array_agg(255 + round(random()*100)) from generate_series(1, 100) as i group by i % 100; +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_idx'); + +-- cleanup +DROP TABLE gin_check; + +-- Test index inserts +SELECT setseed(1); +CREATE TABLE "gin_check"("Column1" int[]); +CREATE INDEX gin_check_idx on "gin_check" USING GIN("Column1"); +ALTER INDEX gin_check_idx SET (fastupdate = false); +-- posting trees +INSERT INTO gin_check select array_agg(round(random()*255) ) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check select array_agg(100 + round(random()*255)) from generate_series(1, 100) as i group by i % 100; + +SELECT gin_index_parent_check('gin_check_idx'); + +-- cleanup +DROP TABLE gin_check; + +-- Test GIN over text array +SELECT setseed(1); +CREATE TABLE "gin_check_text_array"("Column1" text[]); +-- posting trees +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300)::text)::text) from generate_series(1, 100000) as i group by i % 10000; +-- posting leaves +INSERT INTO gin_check_text_array select array_agg(md5(round(random()*300 + 300)::text)::text) from generate_series(1, 10000) as i group by i % 100; +CREATE INDEX gin_check_text_array_idx on "gin_check_text_array" USING GIN("Column1"); +SELECT gin_index_parent_check('gin_check_text_array_idx'); + +-- cleanup +DROP TABLE gin_check_text_array; diff --git a/contrib/amcheck/verify_gin.c b/contrib/amcheck/verify_gin.c new file mode 100644 index 00000000000..877ecacb9c1 --- /dev/null +++ b/contrib/amcheck/verify_gin.c @@ -0,0 +1,769 @@ +/*------------------------------------------------------------------------- + * + * verify_gin.c + * Verifies the integrity of GIN indexes based on invariants. + * + * Verification checks that all paths in GIN graph contain + * consistent keys: tuples on parent pages consistently include tuples + * from children pages. Also, verification checks graph invariants: + * internal page must have at least one downlinks, internal page can + * reference either only leaf pages or only internal pages. + * + * + * Copyright (c) 2017-2023, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/amcheck/verify_gin.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/gin_private.h" +#include "access/nbtree.h" +#include "amcheck.h" +#include "catalog/pg_am.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "string.h" + +/* + * GinScanItem represents one item of depth-first scan of GIN index. + */ +typedef struct GinScanItem +{ + int depth; + IndexTuple parenttup; + BlockNumber parentblk; + XLogRecPtr parentlsn; + BlockNumber blkno; + struct GinScanItem *next; +} GinScanItem; + +/* + * GinPostingTreeScanItem represents one item of depth-first scan of GIN posting tree. + */ +typedef struct GinPostingTreeScanItem +{ + int depth; + ItemPointerData parentkey; + BlockNumber parentblk; + BlockNumber blkno; + struct GinPostingTreeScanItem *next; +} GinPostingTreeScanItem; + + +PG_FUNCTION_INFO_V1(gin_index_parent_check); + +static void gin_check_parent_keys_consistency(Relation rel, + Relation heaprel, + void *callback_state, bool readonly); +static void check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo); +static IndexTuple gin_refind_parent(Relation rel, + BlockNumber parentblkno, + BlockNumber childblkno, + BufferAccessStrategy strategy); +static ItemId PageGetItemIdCareful(Relation rel, BlockNumber block, Page page, + OffsetNumber offset); + +/* + * gin_index_parent_check(index regclass) + * + * Verify integrity of GIN index. + * + * Acquires AccessShareLock on heap & index relations. + */ +Datum +gin_index_parent_check(PG_FUNCTION_ARGS) +{ + Oid indrelid = PG_GETARG_OID(0); + + amcheck_lock_relation_and_check(indrelid, + GIN_AM_OID, + gin_check_parent_keys_consistency, + AccessShareLock, + NULL); + + PG_RETURN_VOID(); +} + +/* + * Read item pointers from leaf entry tuple. + * + * Returns a palloc'd array of ItemPointers. The number of items is returned + * in *nitems. + */ +static ItemPointer +ginReadTupleWithoutState(IndexTuple itup, int *nitems) +{ + Pointer ptr = GinGetPosting(itup); + int nipd = GinGetNPosting(itup); + ItemPointer ipd; + int ndecoded; + + if (GinItupIsCompressed(itup)) + { + if (nipd > 0) + { + ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded); + if (nipd != ndecoded) + elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded", + nipd, ndecoded); + } + else + { + ipd = palloc(0); + } + } + else + { + ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd); + memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd); + } + *nitems = nipd; + return ipd; +} + +/* + * Allocates memory context and scans through postigTree graph + * + */ +static void +gin_check_posting_tree_parent_keys_consistency(Relation rel, BlockNumber posting_tree_root) +{ + BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD); + GinPostingTreeScanItem *stack; + MemoryContext mctx; + MemoryContext oldcontext; + + int leafdepth; + + mctx = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(mctx); + + /* + * We don't know the height of the tree yet, but as soon as we encounter a + * leaf page, we will set 'leafdepth' to its depth. + */ + leafdepth = -1; + + /* Start the scan at the root page */ + stack = (GinPostingTreeScanItem *) palloc0(sizeof(GinPostingTreeScanItem)); + stack->depth = 0; + ItemPointerSetInvalid(&stack->parentkey); + stack->parentblk = InvalidBlockNumber; + stack->blkno = posting_tree_root; + + elog(DEBUG3, "processing posting tree at blk %u", posting_tree_root); + + while (stack) + { + GinPostingTreeScanItem *stack_next; + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, + RBM_NORMAL, strategy); + LockBuffer(buffer, GIN_SHARE); + page = (Page) BufferGetPage(buffer); + Assert(GinPageIsData(page)); + + /* Check that the tree has the same height in all branches */ + if (GinPageIsLeaf(page)) + { + ItemPointerData minItem; + int nlist; + ItemPointerData *list; + char tidrange_buf[100]; + + ItemPointerSetMin(&minItem); + + if (leafdepth == -1) + leafdepth = stack->depth; + else if (stack->depth != leafdepth) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u", + RelationGetRelationName(rel), stack->blkno))); + list = GinDataLeafPageGetItems(page, &nlist, minItem); + + if (nlist > 0) + { + snprintf(tidrange_buf, sizeof(tidrange_buf), + "%d tids (%u, %u) - (%u, %u)", + nlist, + ItemPointerGetBlockNumberNoCheck(&list[0]), + ItemPointerGetOffsetNumberNoCheck(&list[0]), + ItemPointerGetBlockNumberNoCheck(&list[nlist - 1]), + ItemPointerGetOffsetNumberNoCheck(&list[nlist - 1])); + } + else + { + snprintf(tidrange_buf, sizeof(tidrange_buf), "0 tids"); + } + + if (stack->parentblk != InvalidBlockNumber) + { + elog(DEBUG3, "blk %u: parent %u highkey (%u, %u), %s", + stack->blkno, + stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey), + tidrange_buf); + } + else + { + elog(DEBUG3, "blk %u: root leaf, %s", + stack->blkno, + tidrange_buf); + } + + if (stack->parentblk != InvalidBlockNumber && + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey) != InvalidOffsetNumber && + nlist > 0 && ItemPointerCompare(&stack->parentkey, &list[nlist - 1]) < 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": tid exceeds parent's high key in postingTree leaf on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + else + { + LocationIndex pd_lower; + ItemPointerData bound; + int lowersize; + + /* + * Check that tuples in each page are properly ordered and + * consistent with parent high key + */ + maxoff = GinPageGetOpaque(page)->maxoff; + if (stack->parentblk != InvalidBlockNumber) + elog(DEBUG3, "blk %u: internal posting tree page with %u items, parent %u highkey (%u, %u)", + stack->blkno, maxoff, stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey)); + else + elog(DEBUG3, "blk %u: root internal posting tree page with %u items", + stack->blkno, maxoff); + + /* + * A GIN posting tree internal page stores PostingItems in the + * 'lower' part of the page. The 'upper' part is unused. The + * number of elements is stored in the opaque area (maxoff). Make + * sure the size of the 'lower' part agrees with 'maxoff' + * + * We didn't set pd_lower until PostgreSQL version 9.4, so if this + * check fails, it could also be because the index was + * binary-upgraded from an earlier version. That was a long time + * ago, though, so let's warn if it doesn't match. + */ + pd_lower = ((PageHeader) page)->pd_lower; + lowersize = pd_lower - MAXALIGN(SizeOfPageHeaderData); + if ((lowersize - MAXALIGN(sizeof(ItemPointerData))) / sizeof(PostingItem) != maxoff) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has unexpected pd_lower %u in posting tree block %u with maxoff %u)", + RelationGetRelationName(rel), pd_lower, stack->blkno, maxoff))); + + /* + * Before the PostingItems, there's one ItemPointerData in the + * 'lower' part that stores the page's high key. + */ + bound = *GinDataPageGetRightBound(page); + + if (stack->parentblk != InvalidBlockNumber && + !ItemPointerEquals(&stack->parentkey, &bound)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting tree page's high key (%u, %u) doesn't match the downlink on block %u (parent blk %u, key (%u, %u))", + RelationGetRelationName(rel), + ItemPointerGetBlockNumberNoCheck(&bound), + ItemPointerGetOffsetNumberNoCheck(&bound), + stack->blkno, stack->parentblk, + ItemPointerGetBlockNumberNoCheck(&stack->parentkey), + ItemPointerGetOffsetNumberNoCheck(&stack->parentkey)))); + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + PostingItem *posting_item = GinDataPageGetPostingItem(page, i); + + elog(DEBUG3, "key (%u, %u) -> %u", + ItemPointerGetBlockNumber(&posting_item->key), + ItemPointerGetOffsetNumber(&posting_item->key), + BlockIdGetBlockNumber(&posting_item->child_blkno)); + + if (i == maxoff && + GinPageGetOpaque(page)->rightlink == InvalidBlockNumber) + { + /* + * The rightmost item in the tree level has (0, 0) as the + * key + */ + if (ItemPointerGetBlockNumberNoCheck(&posting_item->key) != 0 || + ItemPointerGetOffsetNumberNoCheck(&posting_item->key) != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": rightmost posting tree page (blk %u) has unexpected last key (%u, %u)", + RelationGetRelationName(rel), + stack->blkno, + ItemPointerGetBlockNumberNoCheck(&posting_item->key), + ItemPointerGetOffsetNumberNoCheck(&posting_item->key)))); + } + else if (i != FirstOffsetNumber) + { + PostingItem *previous_posting_item = GinDataPageGetPostingItem(page, i - 1); + + if (ItemPointerCompare(&posting_item->key, &previous_posting_item->key) < 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has wrong tuple order in posting tree, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + } + + /* + * Check if this tuple is consistent with the downlink in the + * parent. + */ + if (stack->parentblk != InvalidBlockNumber && i == maxoff && + ItemPointerCompare(&stack->parentkey, &posting_item->key) < 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting item exceeds parent's high key in postingTree internal page on block %u offset %u", + RelationGetRelationName(rel), + stack->blkno, i))); + + /* If this is an internal page, recurse into the child */ + if (!GinPageIsLeaf(page)) + { + GinPostingTreeScanItem *ptr; + + ptr = (GinPostingTreeScanItem *) palloc(sizeof(GinPostingTreeScanItem)); + ptr->depth = stack->depth + 1; + ptr->parentkey = posting_item->key; + ptr->parentblk = stack->blkno; + ptr->blkno = BlockIdGetBlockNumber(&posting_item->child_blkno); + ptr->next = stack->next; + stack->next = ptr; + } + } + } + LockBuffer(buffer, GIN_UNLOCK); + ReleaseBuffer(buffer); + + /* Step to next item in the queue */ + stack_next = stack->next; + pfree(stack); + stack = stack_next; + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mctx); +} + +/* + * Main entry point for GIN check. Allocates memory context and scans through + * GIN graph. + */ +static void +gin_check_parent_keys_consistency(Relation rel, + Relation heaprel, + void *callback_state, + bool readonly) +{ + BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD); + GinScanItem *stack; + MemoryContext mctx; + MemoryContext oldcontext; + GinState state; + int leafdepth; + + mctx = AllocSetContextCreate(CurrentMemoryContext, + "amcheck context", + ALLOCSET_DEFAULT_SIZES); + oldcontext = MemoryContextSwitchTo(mctx); + initGinState(&state, rel); + + /* + * We don't know the height of the tree yet, but as soon as we encounter a + * leaf page, we will set 'leafdepth' to its depth. + */ + leafdepth = -1; + + /* Start the scan at the root page */ + stack = (GinScanItem *) palloc0(sizeof(GinScanItem)); + stack->depth = 0; + stack->parenttup = NULL; + stack->parentblk = InvalidBlockNumber; + stack->parentlsn = InvalidXLogRecPtr; + stack->blkno = GIN_ROOT_BLKNO; + + while (stack) + { + GinScanItem *stack_next; + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + XLogRecPtr lsn; + IndexTuple prev_tuple; + + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, + RBM_NORMAL, strategy); + LockBuffer(buffer, GIN_SHARE); + page = (Page) BufferGetPage(buffer); + lsn = BufferGetLSNAtomic(buffer); + maxoff = PageGetMaxOffsetNumber(page); + + /* Do basic sanity checks on the page headers */ + check_index_page(rel, buffer, stack->blkno); + + /* + * It's possible that the page was split since we looked at the + * parent, so that we didn't missed the downlink of the right sibling + * when we scanned the parent. If so, add the right sibling to the + * stack now. + */ + if (stack->parenttup != NULL) + { + GinNullCategory parent_key_category; + Datum parent_key = gintuple_get_key(&state, + stack->parenttup, + &parent_key_category); + ItemId iid = PageGetItemIdCareful(rel, stack->blkno, + page, maxoff); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); + GinNullCategory page_max_key_category; + Datum page_max_key = gintuple_get_key(&state, idxtuple, &page_max_key_category); + + if (GinPageGetOpaque(page)->rightlink != InvalidBlockNumber && + ginCompareEntries(&state, attnum, page_max_key, + page_max_key_category, parent_key, + parent_key_category) > 0) + { + /* split page detected, install right link to the stack */ + GinScanItem *ptr; + + elog(DEBUG3, "split detected"); + + ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr->depth = stack->depth; + ptr->parenttup = CopyIndexTuple(stack->parenttup); + ptr->parentblk = stack->parentblk; + ptr->parentlsn = stack->parentlsn; + ptr->blkno = GinPageGetOpaque(page)->rightlink; + ptr->next = stack->next; + stack->next = ptr; + } + } + + /* Check that the tree has the same height in all branches */ + if (GinPageIsLeaf(page)) + { + if (leafdepth == -1) + leafdepth = stack->depth; + else if (stack->depth != leafdepth) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": internal pages traversal encountered leaf page unexpectedly on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + + /* + * Check that tuples in each page are properly ordered and consistent + * with parent high key + */ + prev_tuple = NULL; + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId iid = PageGetItemIdCareful(rel, stack->blkno, page, i); + IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + OffsetNumber attnum = gintuple_get_attrnum(&state, idxtuple); + GinNullCategory prev_key_category; + Datum prev_key; + GinNullCategory current_key_category; + Datum current_key; + + if (MAXALIGN(ItemIdGetLength(iid)) != MAXALIGN(IndexTupleSize(idxtuple))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent tuple sizes, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + + current_key = gintuple_get_key(&state, idxtuple, ¤t_key_category); + + /* (apparently) first block is metadata, skip order check */ + if (i != FirstOffsetNumber && stack->blkno != (BlockNumber) 1) + { + prev_key = gintuple_get_key(&state, prev_tuple, &prev_key_category); + if (ginCompareEntries(&state, attnum, prev_key, + prev_key_category, current_key, + current_key_category) >= 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has wrong tuple order, block %u, offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + } + + /* + * Check if this tuple is consistent with the downlink in the + * parent. + */ + if (stack->parenttup && + i == maxoff) + { + GinNullCategory parent_key_category; + Datum parent_key = gintuple_get_key(&state, + stack->parenttup, + &parent_key_category); + + if (ginCompareEntries(&state, attnum, current_key, + current_key_category, parent_key, + parent_key_category) > 0) + { + /* + * There was a discrepancy between parent and child + * tuples. We need to verify it is not a result of + * concurrent call of gistplacetopage(). So, lock parent + * and try to find downlink for current page. It may be + * missing due to concurrent page split, this is OK. + */ + pfree(stack->parenttup); + stack->parenttup = gin_refind_parent(rel, stack->parentblk, + stack->blkno, strategy); + + /* We found it - make a final check before failing */ + if (!stack->parenttup) + elog(NOTICE, "Unable to find parent tuple for block %u on block %u due to concurrent split", + stack->blkno, stack->parentblk); + else + { + parent_key = gintuple_get_key(&state, + stack->parenttup, + &parent_key_category); + if (ginCompareEntries(&state, attnum, current_key, + current_key_category, parent_key, + parent_key_category) > 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has inconsistent records on page %u offset %u", + RelationGetRelationName(rel), stack->blkno, i))); + else + { + /* + * But now it is properly adjusted - nothing to do + * here. + */ + } + } + } + } + + /* If this is an internal page, recurse into the child */ + if (!GinPageIsLeaf(page)) + { + GinScanItem *ptr; + + ptr = (GinScanItem *) palloc(sizeof(GinScanItem)); + ptr->depth = stack->depth + 1; + /* last tuple in layer has no high key */ + if (i != maxoff && !GinPageGetOpaque(page)->rightlink) + { + ptr->parenttup = CopyIndexTuple(idxtuple); + } + else + { + ptr->parenttup = NULL; + } + ptr->parentblk = stack->blkno; + ptr->blkno = GinGetDownlink(idxtuple); + ptr->parentlsn = lsn; + ptr->next = stack->next; + stack->next = ptr; + } + /* If this item is a pointer to a posting tree, recurse into it */ + else if (GinIsPostingTree(idxtuple)) + { + BlockNumber rootPostingTree = GinGetPostingTree(idxtuple); + + gin_check_posting_tree_parent_keys_consistency(rel, rootPostingTree); + } + else + { + ItemPointer ipd; + int nipd; + + ipd = ginReadTupleWithoutState(idxtuple, &nipd); + + for (int j = 0; j < nipd; j++) + { + if (!OffsetNumberIsValid(ItemPointerGetOffsetNumber(&ipd[j]))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\": posting list contains invalid heap pointer on block %u", + RelationGetRelationName(rel), stack->blkno))); + } + pfree(ipd); + } + + prev_tuple = CopyIndexTuple(idxtuple); + } + + LockBuffer(buffer, GIN_UNLOCK); + ReleaseBuffer(buffer); + + /* Step to next item in the queue */ + stack_next = stack->next; + if (stack->parenttup) + pfree(stack->parenttup); + pfree(stack); + stack = stack_next; + } + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mctx); +} + +/* + * Verify that a freshly-read page looks sane. + */ +static void +check_index_page(Relation rel, Buffer buffer, BlockNumber blockNo) +{ + Page page = BufferGetPage(buffer); + + /* + * ReadBuffer verifies that every newly-read page passes + * PageHeaderIsValid, which means it either contains a reasonably sane + * page header or is all-zero. We have to defend against the all-zero + * case, however. + */ + if (PageIsNew(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains unexpected zero page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buffer)), + errhint("Please REINDEX it."))); + + /* + * Additionally check that the special area looks sane. + */ + if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GinPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" contains corrupted page at block %u", + RelationGetRelationName(rel), + BufferGetBlockNumber(buffer)), + errhint("Please REINDEX it."))); + + if (GinPageIsDeleted(page)) + { + if (!GinPageIsLeaf(page)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted internal page %d", + RelationGetRelationName(rel), blockNo))); + if (PageGetMaxOffsetNumber(page) > InvalidOffsetNumber) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has deleted page %d with tuples", + RelationGetRelationName(rel), blockNo))); + } + else if (PageGetMaxOffsetNumber(page) > MaxIndexTuplesPerPage) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" has page %d with exceeding count of tuples", + RelationGetRelationName(rel), blockNo))); +} + +/* + * Try to re-find downlink pointing to 'blkno', in 'parentblkno'. + * + * If found, returns a palloc'd copy of the downlink tuple. Otherwise, + * returns NULL. + */ +static IndexTuple +gin_refind_parent(Relation rel, BlockNumber parentblkno, + BlockNumber childblkno, BufferAccessStrategy strategy) +{ + Buffer parentbuf; + Page parentpage; + OffsetNumber o, + parent_maxoff; + IndexTuple result = NULL; + + parentbuf = ReadBufferExtended(rel, MAIN_FORKNUM, parentblkno, RBM_NORMAL, + strategy); + + LockBuffer(parentbuf, GIN_SHARE); + parentpage = BufferGetPage(parentbuf); + + if (GinPageIsLeaf(parentpage)) + { + UnlockReleaseBuffer(parentbuf); + return result; + } + + parent_maxoff = PageGetMaxOffsetNumber(parentpage); + for (o = FirstOffsetNumber; o <= parent_maxoff; o = OffsetNumberNext(o)) + { + ItemId p_iid = PageGetItemIdCareful(rel, parentblkno, parentpage, o); + IndexTuple itup = (IndexTuple) PageGetItem(parentpage, p_iid); + + if (ItemPointerGetBlockNumber(&(itup->t_tid)) == childblkno) + { + /* Found it! Make copy and return it */ + result = CopyIndexTuple(itup); + break; + } + } + + UnlockReleaseBuffer(parentbuf); + + return result; +} + +static ItemId +PageGetItemIdCareful(Relation rel, BlockNumber block, Page page, + OffsetNumber offset) +{ + ItemId itemid = PageGetItemId(page, offset); + + if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > + BLCKSZ - MAXALIGN(sizeof(GinPageOpaqueData))) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("line pointer points past end of tuple space in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + /* + * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED or LP_DEAD, + * since GIN never uses all three. Verify that line pointer has storage, + * too. + */ + if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) || + ItemIdIsDead(itemid) || ItemIdGetLength(itemid) == 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("invalid line pointer storage in index \"%s\"", + RelationGetRelationName(rel)), + errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.", + block, offset, ItemIdGetOffset(itemid), + ItemIdGetLength(itemid), + ItemIdGetFlags(itemid)))); + + return itemid; +} diff --git a/doc/src/sgml/amcheck.sgml b/doc/src/sgml/amcheck.sgml index 6eb526c6bb7..e1a471474e6 100644 --- a/doc/src/sgml/amcheck.sgml +++ b/doc/src/sgml/amcheck.sgml @@ -189,6 +189,25 @@ ORDER BY c.relpages DESC LIMIT 10; + + + gin_index_parent_check(index regclass) returns void + + gin_index_parent_check + + + + + + gin_index_parent_check tests that its target GIN index + has consistent parent-child tuples relations (no parent tuples + require tuple adjustement) and page graph respects balanced-tree + invariants (internal pages reference only leaf page or only internal + pages). + + + + gist_index_check(index regclass, heapallindexed boolean) returns void -- 2.45.2