From 0233fc19901b811f65f59bbdc13c820f7154893a Mon Sep 17 00:00:00 2001 From: David Christensen Date: Tue, 18 Oct 2022 14:28:09 -0400 Subject: [PATCH 1/6] Add reserved_page_space to Page structure This space is reserved for extended data on the Page structure which will be ultimately used for encrypted data, extended checksums, and potentially other things. This data appears at the end of the Page, after any `pd_special` area, and will be calculated at runtime based on specific ControlFile features. No effort is made to ensure this is backwards-compatible with existing clusters for `pg_upgrade`, as we will require logical replication to move data into a cluster with different settings here. --- src/backend/access/gin/ginfast.c | 2 +- src/backend/access/nbtree/nbtdedup.c | 2 +- src/backend/access/nbtree/nbtsplitloc.c | 2 +- src/backend/storage/page/bufpage.c | 16 ++++++++-------- src/backend/utils/init/globals.c | 3 +++ src/backend/utils/misc/guc_tables.c | 13 +++++++++++++ src/bin/initdb/initdb.c | 1 + src/include/access/ginblock.h | 4 +++- src/include/access/hash.h | 5 ++++- src/include/access/htup_details.h | 9 +++++---- src/include/access/nbtree.h | 13 +++++++++---- src/include/access/spgist_private.h | 1 + src/include/storage/bufpage.h | 21 +++++++++++++++------ src/test/regress/expected/insert.out | 2 +- src/test/regress/expected/vacuum.out | 4 ++-- src/test/regress/sql/insert.sql | 2 +- src/test/regress/sql/vacuum.sql | 4 ++-- 17 files changed, 71 insertions(+), 33 deletions(-) diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index f750b5ed9e..c2bb952048 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -39,7 +39,7 @@ int gin_pending_list_limit = 0; #define GIN_PAGE_FREESIZE \ - ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) + ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) - MaxSizeOfPageReservedSpace ) typedef struct 
KeyArray { diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index 93a025b0a9..c40f6ae800 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -826,7 +826,7 @@ _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz) /* This calculation needs to match nbtsplitloc.c */ leftfree = PageGetPageSize(page) - SizeOfPageHeaderData - - MAXALIGN(sizeof(BTPageOpaqueData)); + MAXALIGN(sizeof(BTPageOpaqueData)) - SizeOfPageReservedSpace; /* Subtract size of new high key (includes pivot heap TID space) */ leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData)); diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 241e26d338..bb18707892 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -156,7 +156,7 @@ _bt_findsplitloc(Relation rel, /* Total free space available on a btree page, after fixed overhead */ leftspace = rightspace = - PageGetPageSize(origpage) - SizeOfPageHeaderData - + PageGetPageSize(origpage) - SizeOfPageHeaderData - SizeOfPageReservedSpace - MAXALIGN(sizeof(BTPageOpaqueData)); /* The right page will have the same high key as the old page */ diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 8b617c7e79..a76b8aab6c 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -43,7 +43,7 @@ PageInit(Page page, Size pageSize, Size specialSize) { PageHeader p = (PageHeader) page; - specialSize = MAXALIGN(specialSize); + specialSize = MAXALIGN(specialSize) + reserved_page_size; Assert(pageSize == BLCKSZ); Assert(pageSize > specialSize + SizeOfPageHeaderData); @@ -117,7 +117,7 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && p->pd_lower <= p->pd_upper && p->pd_upper <= p->pd_special && - p->pd_special <= BLCKSZ && + p->pd_special + 
reserved_page_size <= BLCKSZ && p->pd_special == MAXALIGN(p->pd_special)) header_sane = true; @@ -211,7 +211,7 @@ PageAddItemExtended(Page page, if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ) + phdr->pd_special + reserved_page_size > BLCKSZ) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", @@ -723,7 +723,7 @@ PageRepairFragmentation(Page page) if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || - pd_special > BLCKSZ || + pd_special + reserved_page_size > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1066,7 +1066,7 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum) if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special + reserved_page_size > BLCKSZ || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1201,7 +1201,7 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || - pd_special > BLCKSZ || + pd_special + reserved_page_size > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1307,7 +1307,7 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special + reserved_page_size > BLCKSZ || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1419,7 +1419,7 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper 
|| phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special + reserved_page_size > BLCKSZ || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 1a5d29ac9b..3e241eba5b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -151,3 +151,6 @@ int64 VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; + +int reserved_page_size = 0; /* how much page space to reserve for extended unencrypted metadata */ + diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 05ab087934..a2d10d149d 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -2624,6 +2624,19 @@ struct config_int ConfigureNamesInt[] = NULL, assign_max_wal_size, NULL }, + { + {"reserved_page_size", PGC_INTERNAL, PRESET_OPTIONS, + gettext_noop("Shows the size of reserved space for extended pages."), + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE + }, + &reserved_page_size, + 0, + 0, + MaxSizeOfPageReservedSpace, + NULL, NULL, NULL + }, + { {"checkpoint_timeout", PGC_SIGHUP, WAL_CHECKPOINTS, gettext_noop("Sets the maximum time between automatic WAL checkpoints."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index f61a043055..40561d5d61 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -77,6 +77,7 @@ #include "getopt_long.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "storage/bufpage.h" /* MaxSizeOfPageReservedSpace */ /* Ideally this would be in a .h file, but it hardly seems worth the trouble */ diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index 9347f464f3..d2c011abbd 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -249,6 +249,7 @@ typedef signed char GinNullCategory; #define 
GinMaxItemSize \ Min(INDEX_SIZE_MASK, \ MAXALIGN_DOWN(((BLCKSZ - \ + MaxSizeOfPageReservedSpace - \ MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \ MAXALIGN(sizeof(GinPageOpaqueData))) / 3))) @@ -319,6 +320,7 @@ typedef signed char GinNullCategory; #define GinDataPageMaxDataSize \ (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + - MaxSizeOfPageReservedSpace \ - MAXALIGN(sizeof(ItemPointerData)) \ - MAXALIGN(sizeof(GinPageOpaqueData))) @@ -326,7 +328,7 @@ typedef signed char GinNullCategory; * List pages */ #define GinListPageSize \ - ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) - MaxSizeOfPageReservedSpace) /* * A compressed posting list. diff --git a/src/include/access/hash.h b/src/include/access/hash.h index da372841c4..b21cd2e8c6 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -287,6 +287,7 @@ typedef struct HashOptions #define HashMaxItemSize(page) \ MAXALIGN_DOWN(PageGetPageSize(page) - \ SizeOfPageHeaderData - \ + SizeOfPageReservedSpace - \ sizeof(ItemIdData) - \ MAXALIGN(sizeof(HashPageOpaqueData))) @@ -318,7 +319,9 @@ typedef struct HashOptions #define HashGetMaxBitmapSize(page) \ (PageGetPageSize((Page) page) - \ - (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData)))) + (MAXALIGN(SizeOfPageHeaderData) + \ + SizeOfPageReservedSpace + \ + MAXALIGN(sizeof(HashPageOpaqueData)))) #define HashPageGetMeta(page) \ ((HashMetaPage) PageGetContents(page)) diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 9561c835f2..0d0f97bd86 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -548,15 +548,16 @@ do { \ /* * MaxHeapTupleSize is the maximum allowed size of a heap tuple, including * header and MAXALIGN alignment padding. Basically it's BLCKSZ minus the - * other stuff that has to be on a disk page. 
Since heap pages use no - * "special space", there's no deduction for that. + * other stuff that has to be on a disk page. We also include + * MaxSizeOfPageReservedSpace bytes in this calculation as this could be + * enabled. * * NOTE: we allow for the ItemId that must point to the tuple, ensuring that * an otherwise-empty page can indeed hold a tuple of this size. Because * ItemIds and tuples have different alignment requirements, don't assume that * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. */ -#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) +#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData) + MaxSizeOfPageReservedSpace)) #define MinHeapTupleSize MAXALIGN(SizeofHeapTupleHeader) /* @@ -571,7 +572,7 @@ do { \ * require increases in the size of work arrays. */ #define MaxHeapTuplesPerPage \ - ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ + ((int) ((BLCKSZ - SizeOfPageHeaderData - MaxSizeOfPageReservedSpace) / \ (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)))) /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8e4f6864e5..adf7be54d7 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -162,12 +162,12 @@ typedef struct BTMetaPageData * attribute, which we account for here. 
*/ #define BTMaxItemSize(page) \ - (MAXALIGN_DOWN((PageGetPageSize(page) - \ + (MAXALIGN_DOWN((PageGetPageSize(page) - SizeOfPageReservedSpace - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ MAXALIGN(sizeof(ItemPointerData))) #define BTMaxItemSizeNoHeapTid(page) \ - MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN_DOWN((PageGetPageSize(page) - SizeOfPageReservedSpace - \ MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ MAXALIGN(sizeof(BTPageOpaqueData))) / 3) @@ -181,10 +181,15 @@ typedef struct BTMetaPageData * heap TIDs must have to fill the space between the page header and * special area). The value is slightly higher (i.e. more conservative) * than necessary as a result, which is considered acceptable. + * + * Since this is a fixed-size upper limit we restrict to the max size of page + * reserved space; this does mean that we pay a cost of + * (MaxSizeOfPageReservedSpace / sizeof(ItemPointerData)) fewer TIDs stored + * on a page. */ #define MaxTIDsPerBTreePage \ - (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ - sizeof(ItemPointerData)) + (int) ((BLCKSZ - SizeOfPageHeaderData - MaxSizeOfPageReservedSpace - \ + sizeof(BTPageOpaqueData)) / sizeof(ItemPointerData)) /* * The leaf-page fillfactor defaults to 90% but is user-adjustable. 
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index eb56b1c6b8..d6b0cc11a5 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -447,6 +447,7 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; #define SPGIST_PAGE_CAPACITY \ MAXALIGN_DOWN(BLCKSZ - \ SizeOfPageHeaderData - \ + SizeOfPageReservedSpace - \ MAXALIGN(sizeof(SpGistPageOpaqueData))) /* diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 2708c4b683..295ac1367d 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -19,6 +19,13 @@ #include "storage/item.h" #include "storage/off.h" +extern int reserved_page_size; + +#define SizeOfPageReservedSpace reserved_page_size +/* strict upper bound on the amount of space that can be reserved on + * pages in this cluster */ +#define MaxSizeOfPageReservedSpace 64 + /* * A postgres disk page is an abstraction layered on top of a postgres * disk block (which is simply a unit of i/o, see block.h). @@ -36,10 +43,10 @@ * | v pd_upper | * +-------------+------------------------------------+ * | | tupleN ... | - * +-------------+------------------+-----------------+ - * | ... tuple3 tuple2 tuple1 | "special space" | - * +--------------------------------+-----------------+ - * ^ pd_special + * +-------------+-----+------------+-----------------+ + * | ... tuple2 tuple1 | "special space" | "reserved" | + * +-------------------+------------+-----------------+ + * ^ pd_special ^ reserved_page_space * * a page is full when nothing can be added between pd_lower and * pd_upper. @@ -73,6 +80,8 @@ * stored as the page trailer. an access method should always * initialize its pages with PageInit and then set its own opaque * fields. 
+ * + * XXX - update more comments here about reserved_page_space */ typedef Pointer Page; @@ -313,7 +322,7 @@ PageSetPageSizeAndVersion(Page page, Size size, uint8 version) static inline uint16 PageGetSpecialSize(Page page) { - return (PageGetPageSize(page) - ((PageHeader) page)->pd_special); + return (PageGetPageSize(page) - ((PageHeader) page)->pd_special - reserved_page_size); } /* @@ -325,7 +334,7 @@ static inline void PageValidateSpecialPointer(Page page) { Assert(page); - Assert(((PageHeader) page)->pd_special <= BLCKSZ); + Assert((((PageHeader) page)->pd_special + reserved_page_size) <= BLCKSZ); Assert(((PageHeader) page)->pd_special >= SizeOfPageHeaderData); } diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index dd4354fc7d..548e896289 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -100,7 +100,7 @@ SELECT pg_size_pretty(pg_relation_size('large_tuple_test'::regclass, 'main')); INSERT INTO large_tuple_test (select 3, NULL); -- now this tuple won't fit on the second page, but the insert should -- still succeed by extending the relation -INSERT INTO large_tuple_test (select 4, repeat('a', 8126)); +INSERT INTO large_tuple_test (select 4, repeat('a', 8062)); DROP TABLE large_tuple_test; -- -- check indirection (field/array assignment), cf bug #14265 diff --git a/src/test/regress/expected/vacuum.out b/src/test/regress/expected/vacuum.out index c63a157e5f..583a5a91ae 100644 --- a/src/test/regress/expected/vacuum.out +++ b/src/test/regress/expected/vacuum.out @@ -134,7 +134,7 @@ CREATE TABLE no_index_cleanup (i INT PRIMARY KEY, t TEXT); CREATE INDEX no_index_cleanup_idx ON no_index_cleanup(t); ALTER TABLE no_index_cleanup ALTER COLUMN t SET STORAGE EXTERNAL; INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(1,30), - repeat('1234567890',269)); + repeat('1234567890',266)); -- index cleanup option is ignored if VACUUM FULL VACUUM (INDEX_CLEANUP TRUE, FULL TRUE) 
no_index_cleanup; VACUUM (FULL TRUE) no_index_cleanup; @@ -150,7 +150,7 @@ ALTER TABLE no_index_cleanup SET (vacuum_index_cleanup = auto); VACUUM no_index_cleanup; -- Parameter is set for both the parent table and its toast relation. INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(31,60), - repeat('1234567890',269)); + repeat('1234567890',266)); DELETE FROM no_index_cleanup WHERE i < 45; -- Only toast index is cleaned up. ALTER TABLE no_index_cleanup SET (vacuum_index_cleanup = off, diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index bdcffd0314..f481bedd02 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -55,7 +55,7 @@ INSERT INTO large_tuple_test (select 3, NULL); -- now this tuple won't fit on the second page, but the insert should -- still succeed by extending the relation -INSERT INTO large_tuple_test (select 4, repeat('a', 8126)); +INSERT INTO large_tuple_test (select 4, repeat('a', 8062)); DROP TABLE large_tuple_test; diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql index 9faa8a34a6..0aec01b88e 100644 --- a/src/test/regress/sql/vacuum.sql +++ b/src/test/regress/sql/vacuum.sql @@ -115,7 +115,7 @@ CREATE TABLE no_index_cleanup (i INT PRIMARY KEY, t TEXT); CREATE INDEX no_index_cleanup_idx ON no_index_cleanup(t); ALTER TABLE no_index_cleanup ALTER COLUMN t SET STORAGE EXTERNAL; INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(1,30), - repeat('1234567890',269)); + repeat('1234567890',266)); -- index cleanup option is ignored if VACUUM FULL VACUUM (INDEX_CLEANUP TRUE, FULL TRUE) no_index_cleanup; VACUUM (FULL TRUE) no_index_cleanup; @@ -131,7 +131,7 @@ ALTER TABLE no_index_cleanup SET (vacuum_index_cleanup = auto); VACUUM no_index_cleanup; -- Parameter is set for both the parent table and its toast relation. 
INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(31,60), - repeat('1234567890',269)); + repeat('1234567890',266)); DELETE FROM no_index_cleanup WHERE i < 45; -- Only toast index is cleaned up. ALTER TABLE no_index_cleanup SET (vacuum_index_cleanup = off, -- 2.37.0 (Apple Git-136)