From 42fa9f2e2a06ae493e1b77f148693c0633f773a2 Mon Sep 17 00:00:00 2001 From: David Christensen Date: Wed, 19 Oct 2022 22:33:20 -0400 Subject: [PATCH 3/6] Add cluster-wide Page Features Page features are a standardized way of assigning and using dynamic space at the tail end of a disk page. These features are set at cluster init time (so configured via `initdb` and initialized via the bootstrap process) and affect all disk pages. A PageFeatureSet is effectively a bitmask of all configured features, each of which has a fixed size. If not using any PageFeatures, the storage overhead of this is 0. Rather than using a variable-location struct, an implementation of a PageFeature is responsible for an offset and a length in the page. The current API returns only a pointer to the page location for the implementation to manage, and no further checks are done to ensure that only the expected memory is accessed. Access to the underlying memory is synonymous with determining whether a given cluster is using an underlying PageFeature, so code paths can do something like: char *loc; if ((loc = ClusterGetPageFeatureOffset(page, PF_MY_FEATURE_ID))) { // ipso facto this feature is enabled in this cluster *and* we know the memory address ... } Since this is direct memory access to the underlying Page, ensure the buffer is pinned. Explicit locking (assuming you stay in your lane) should only be needed to guard against access from other backends of this type when using shared buffers, so whether it is required will be use-case dependent. This does have a runtime overhead due to moving some offset calculations from compile time to runtime. It is thought that the utility of this feature will outweigh the costs here. Candidates for page features include 32-bit or 64-bit checksums, encryption tags, or additional per-page metadata. While we are not currently getting rid of the pd_checksum field, this mechanism could be used to free up those 16 bits for some other purpose. 
One such purpose might be to mirror the cluster-wide PageFeatureSet, currently also a uint16, which would mean the entirety of this scheme could be reflected in a given page, opening up per-relation or even per-page settings/metadata here. (We'd presumably need to snag a pd_flags bit to interpret pd_checksum that way, but it would be an interesting use.) --- contrib/pg_surgery/heap_surgery.c | 2 +- src/backend/access/brin/brin_bloom.c | 1 + src/backend/access/gin/ginfast.c | 2 +- src/backend/access/heap/heapam.c | 6 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/heap/pruneheap.c | 12 +-- src/backend/access/heap/vacuumlazy.c | 8 +- src/backend/access/transam/xlog.c | 9 ++ src/backend/bootstrap/bootstrap.c | 19 +++- src/backend/nodes/tidbitmap.c | 2 +- src/backend/storage/page/bufpage.c | 2 +- src/backend/utils/init/globals.c | 3 - src/bin/pg_controldata/pg_controldata.c | 3 + src/common/Makefile | 1 + src/common/pagefeat.c | 130 +++++++++++++++++++++++ src/include/access/ginblock.h | 17 ++- src/include/access/heapam.h | 2 +- src/include/access/htup_details.h | 36 +++++-- src/include/access/nbtree.h | 17 ++- src/include/catalog/pg_control.h | 5 +- src/include/common/pagefeat.h | 47 ++++++++ src/include/storage/bufmgr.h | 1 + src/include/storage/bufpage.h | 5 +- 23 files changed, 290 insertions(+), 46 deletions(-) create mode 100644 src/common/pagefeat.c create mode 100644 src/include/common/pagefeat.h diff --git a/contrib/pg_surgery/heap_surgery.c b/contrib/pg_surgery/heap_surgery.c index 8a2ad9773d..72cf1880de 100644 --- a/contrib/pg_surgery/heap_surgery.c +++ b/contrib/pg_surgery/heap_surgery.c @@ -89,7 +89,7 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) Relation rel; OffsetNumber curr_start_ptr, next_start_ptr; - bool include_this_tid[MaxHeapTuplesPerPage]; + bool include_this_tid[MaxHeapTuplesPerPageLimit]; if (RecoveryInProgress()) ereport(ERROR, diff --git a/src/backend/access/brin/brin_bloom.c 
b/src/backend/access/brin/brin_bloom.c index 6b0af7267d..b44a77fed6 100644 --- a/src/backend/access/brin/brin_bloom.c +++ b/src/backend/access/brin/brin_bloom.c @@ -125,6 +125,7 @@ #include "access/stratnum.h" #include "catalog/pg_type.h" #include "catalog/pg_amop.h" +#include "common/pagefeat.h" #include "utils/builtins.h" #include "utils/datum.h" #include "utils/lsyscache.h" diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index c2bb952048..f7f0d64bc2 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -39,7 +39,7 @@ int gin_pending_list_limit = 0; #define GIN_PAGE_FREESIZE \ - ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) - MaxSizeOfPageReservedSpace ) + ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) - SizeOfPageReservedSpace ) typedef struct KeyArray { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index bd4d85041d..c54149b559 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -9088,7 +9088,7 @@ heap_xlog_insert(XLogReaderState *record) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; HeapTupleHeader htup; xl_heap_header xlhdr; @@ -9210,7 +9210,7 @@ heap_xlog_multi_insert(XLogReaderState *record) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; HeapTupleHeader htup; uint32 newlen; @@ -9367,7 +9367,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; xl_heap_header xlhdr; uint32 newlen; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index a3414a76e8..4246b3345e 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1184,7 +1184,7 
@@ heapam_index_build_range_scan(Relation heapRelation, TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + OffsetNumber root_offsets[MaxHeapTuplesPerPageLimit]; /* * sanity checks @@ -1747,8 +1747,8 @@ heapam_index_validate_scan(Relation heapRelation, EState *estate; ExprContext *econtext; BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; - bool in_index[MaxHeapTuplesPerPage]; + OffsetNumber root_offsets[MaxHeapTuplesPerPageLimit]; + bool in_index[MaxHeapTuplesPerPageLimit]; BlockNumber previous_blkno = InvalidBlockNumber; /* state variables for the merge */ diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 9f43bbe25f..ec0fc5faed 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -54,9 +54,9 @@ typedef struct int ndead; int nunused; /* arrays that accumulate indexes of items to be changed */ - OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; - OffsetNumber nowdead[MaxHeapTuplesPerPage]; - OffsetNumber nowunused[MaxHeapTuplesPerPage]; + OffsetNumber redirected[MaxHeapTuplesPerPageLimit * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPageLimit]; + OffsetNumber nowunused[MaxHeapTuplesPerPageLimit]; /* * marked[i] is true if item i is entered in one of the above arrays. @@ -64,7 +64,7 @@ typedef struct * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is * 1. Otherwise every access would need to subtract 1. */ - bool marked[MaxHeapTuplesPerPage + 1]; + bool marked[MaxHeapTuplesPerPageLimit + 1]; /* * Tuple visibility is only computed once for each tuple, for correctness @@ -74,7 +74,7 @@ typedef struct * * Same indexing as ->marked. 
*/ - int8 htsv[MaxHeapTuplesPerPage + 1]; + int8 htsv[MaxHeapTuplesPerPageLimit + 1]; } PruneState; /* Local functions */ @@ -598,7 +598,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) OffsetNumber latestdead = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; - OffsetNumber chainitems[MaxHeapTuplesPerPage]; + OffsetNumber chainitems[MaxHeapTuplesPerPageLimit]; int nchain = 0, i; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index dfbe37472f..7e18f2f712 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1565,8 +1565,8 @@ lazy_scan_prune(LVRelState *vacrel, int nnewlpdead; TransactionId NewRelfrozenXid; MultiXactId NewRelminMxid; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; - xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage]; + OffsetNumber deadoffsets[MaxHeapTuplesPerPageLimit]; + xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPageLimit]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1968,7 +1968,7 @@ lazy_scan_noprune(LVRelState *vacrel, HeapTupleHeader tupleheader; TransactionId NewRelfrozenXid = vacrel->NewRelfrozenXid; MultiXactId NewRelminMxid = vacrel->NewRelminMxid; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + OffsetNumber deadoffsets[MaxHeapTuplesPerPageLimit]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -2497,7 +2497,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, { VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxHeapTuplesPerPage]; + OffsetNumber unused[MaxHeapTuplesPerPageLimit]; int uncnt = 0; TransactionId visibility_cutoff_xid; bool all_frozen; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index dea978a962..21a134a663 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -69,6 +69,7 @@ #include "catalog/pg_database.h" #include 
"common/controldata_utils.h" #include "common/file_utils.h" +#include "common/pagefeat.h" #include "executor/instrument.h" #include "miscadmin.h" #include "pg_trace.h" @@ -89,6 +90,7 @@ #include "storage/ipc.h" #include "storage/large_object.h" #include "storage/latch.h" +#include "common/pagefeat.h" #include "storage/pmsignal.h" #include "storage/predicate.h" #include "storage/proc.h" @@ -109,6 +111,7 @@ #include "utils/varlena.h" extern uint32 bootstrap_data_checksum_version; +extern PageFeatureSet bootstrap_page_features; /* timeline ID to be used when bootstrapping */ #define BootstrapTimeLineID 1 @@ -3898,6 +3901,7 @@ InitControlFile(uint64 sysidentifier) ControlFile->wal_log_hints = wal_log_hints; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = bootstrap_data_checksum_version; + ControlFile->page_features = bootstrap_page_features; } static void @@ -4182,9 +4186,14 @@ ReadControlFile(void) CalculateCheckpointSegments(); + /* set our page-level space reservation from ControlFile if any extended feature flags are set*/ + reserved_page_size = CalculateReservedPageSize(ControlFile->page_features); + /* Make the initdb settings visible as GUC variables, too */ SetConfigOption("data_checksums", DataChecksumsEnabled() ? 
"yes" : "no", PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + + SetExtendedFeatureConfigOptions(ControlFile->page_features); } /* diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 58247d826d..4ee0bf3db7 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -46,7 +46,7 @@ #include "utils/relmapper.h" uint32 bootstrap_data_checksum_version = 0; /* No checksum */ - +PageFeatureSet bootstrap_page_features = 0; /* No special features */ static void CheckerModeMain(void); static void bootstrap_signals(void); @@ -221,7 +221,7 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:e:Fkr:X:-:")) != -1) { switch (flag) { @@ -244,6 +244,19 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) pfree(debugstr); } break; + case 'e': + { + /* enable specific features */ + PageFeatureSet features_tmp; + + features_tmp = PageFeatureSetAddFeatureByName(bootstrap_page_features, optarg); + if (features_tmp == bootstrap_page_features) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Unrecognized page feature requested: \"%s\"", optarg))); + bootstrap_page_features = features_tmp; + } + break; case 'F': SetConfigOption("fsync", "false", PGC_POSTMASTER, PGC_S_ARGV); break; @@ -299,6 +312,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) } } + ClusterPageFeatureInit(bootstrap_page_features); + if (argc != optind) { write_stderr("%s: invalid command-line arguments\n", progname); diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index a7a6b26668..19adae4d23 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -53,7 +53,7 @@ * the per-page bitmaps variable size. 
We just legislate that the size * is this: */ -#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage +#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPageLimit /* * When we have to switch over to lossy storage, we use a data structure diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index a76b8aab6c..74a7bdce33 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -702,7 +702,7 @@ PageRepairFragmentation(Page page) Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; Offset last_offset; - itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; + itemIdCompactData itemidbase[MaxHeapTuplesPerPageLimit]; itemIdCompact itemidptr; ItemId lp; int nline, diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 3e241eba5b..1a5d29ac9b 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -151,6 +151,3 @@ int64 VacuumPageDirty = 0; int VacuumCostBalance = 0; /* working state for vacuum */ bool VacuumCostActive = false; - -int reserved_page_size = 0; /* how much page space to reserve for extended unencrypted metadata */ - diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index c390ec51ce..22e6122458 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -26,6 +26,7 @@ #include "catalog/pg_control.h" #include "common/controldata_utils.h" #include "common/logging.h" +#include "common/pagefeat.h" #include "getopt_long.h" #include "pg_getopt.h" @@ -328,5 +329,7 @@ main(int argc, char *argv[]) ControlFile->data_checksum_version); printf(_("Mock authentication nonce: %s\n"), mock_auth_nonce_str); + printf(_("Reserved page size for features: %d\n"), + CalculateReservedPageSize(ControlFile->page_features)); return 0; } diff --git a/src/common/Makefile b/src/common/Makefile index e9af7346c9..79ffa4dc9a 100644 --- 
a/src/common/Makefile +++ b/src/common/Makefile @@ -65,6 +65,7 @@ OBJS_COMMON = \ kwlookup.o \ link-canary.o \ md5_common.o \ + pagefeat.o \ pg_get_line.o \ pg_lzcompress.o \ pg_prng.o \ diff --git a/src/common/pagefeat.c b/src/common/pagefeat.c new file mode 100644 index 0000000000..75f589714b --- /dev/null +++ b/src/common/pagefeat.c @@ -0,0 +1,130 @@ +/*------------------------------------------------------------------------- + * + * pagefeat.c + * POSTGRES optional page features + * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/common/pagefeat.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "common/pagefeat.h" +#include "utils/guc.h" + +/* global variable to store the reserved_page_size */ +int reserved_page_size; +PageFeatureSet cluster_page_features; + +/* + * A "page feature" is an optional cluster-defined additional data field that + * is stored in the "reserved_page_size" area in the footer of a given Page. + * These features are set at initdb time and are static for the life of the cluster. + * + * Page features are identified by flags, each corresponding to a blob of data + * with a fixed length and content. For a given cluster, these features will + * globally exist or not, and can be queried for feature existence. You can + * also get the data/length for a given feature using accessors. 
+ */ + +typedef struct PageFeatureDesc +{ + uint16 length; + char *guc_name; +} PageFeatureDesc; + +/* these are the fixed widths for each feature type, indexed by feature */ +static PageFeatureDesc feature_descs[PF_MAX_FEATURE] = { +}; + + +/* Return the size for a given set of feature flags */ +uint16 +CalculateReservedPageSize(PageFeatureSet features) +{ + uint16 size = 0; + int i; + + if (!features) + return 0; + + for (i = 0; i < PF_MAX_FEATURE; i++) + if (features & (1<= 0 && feature_id < PF_MAX_FEATURE); + return feature_descs[feature_id].length; +} + + +/* expose the given feature flags as boolean yes/no GUCs */ +void +SetExtendedFeatureConfigOptions(PageFeatureSet features) +{ +#ifndef FRONTEND + int i; + + for (i = 0; i < PF_MAX_FEATURE; i++) + SetConfigOption(feature_descs[i].guc_name, (features & (1<= 0 && feature < PF_MAX_FEATURE); + return features | (1<