From b325d524c91be5860154516bb8b4ea2801e89552 Mon Sep 17 00:00:00 2001 From: David Christensen Date: Thu, 1 Jun 2023 11:34:14 -0400 Subject: [PATCH v1] Introduce initdb-selectable block sizes Whereas we had traditionally used BLCKSZ to indicate the compile-time selected block size, this commit adjusts things so the cluster block size can be selected at initdb time. In order to code for this, we introduce a few new defines: - CLUSTER_BLOCK_SIZE is the blocksize for this cluster itself. This is not valid until BlockSizeInit() has been called in the given backend, which we do as early as possible by parsing the ControlFile and using the blcksz field. - MIN_BLOCK_SIZE and MAX_BLOCK_SIZE are the limits for the selectable block size. It is required that CLUSTER_BLOCK_SIZE is a power of 2 between these two constants. - DEFAULT_BLOCK_SIZE is the moral equivalent of BLCKSZ; it is the built-in default value. This is used in a few places that just needed a buffer of an arbitrary size, but the dynamic value CLUSTER_BLOCK_SIZE should almost always be used instead. - CLUSTER_RELSEG_SIZE is used instead of RELSEG_SIZE, since internally we are storing the segment size in terms of number of blocks. RELSEG_SIZE is still kept, but is used in terms of the number of blocks of DEFAULT_BLOCK_SIZE; CLUSTER_RELSEG_SIZE scales appropriately (and is the only thing used internally) to keep the same target total segment size regardless of block size. This patch uses a precalculated table to store the block size itself, as well as additional derived values that have traditionally been compile-time constants (example: MaxHeapTuplesPerPage). The traditional macro names are kept so code that doesn't care about it should not need to change, however the definition of these has changed (see the CalcXXX() routines in blocksize.h for details). A new function, BlockSizeInit(), populates the appropriate values based on the target block size. 
This should be called as early as possible in any code that utilizes block sizes. This patch adds this in the appropriate place on the handful of src/bin/ programs that used BLCKSZ, so this caveat mainly impacts new code. Code which had previously used BLCKSZ should likely be able to get away with changing these instances to CLUSTER_BLOCK_SIZE, unless you're using a structure allocated on the stack. In these cases, the compiler will complain about a dynamically-sized structure. The solution is to utilize an expression with MAX_BLOCK_SIZE instead of BLCKSZ, ensuring enough stack space is allocated for the maximum size. This also does require using CLUSTER_BLOCK_SIZE or an expression based on it when actually using this structure, so in practice more stack space may be allocated than used in principle; as long as there is plenty of stack this should have no specific impacts on code. Initial (basic) performance testing shows only minor changes with the pgbench -S benchmark, though this is obviously an area that will need considerable testing/verification across multiple workloads. 
--- configure | 14 +- configure.ac | 25 +-- contrib/amcheck/verify_heapam.c | 14 +- contrib/amcheck/verify_nbtree.c | 6 +- contrib/bloom/blinsert.c | 4 +- contrib/bloom/bloom.h | 23 +- contrib/bloom/blutils.c | 6 +- contrib/bloom/blvacuum.c | 2 +- contrib/file_fdw/file_fdw.c | 6 +- contrib/pageinspect/btreefuncs.c | 6 +- contrib/pageinspect/expected/checksum.out | 2 +- contrib/pageinspect/expected/checksum_1.out | 2 +- contrib/pageinspect/rawpage.c | 12 +- contrib/pageinspect/sql/checksum.sql | 2 +- contrib/pg_surgery/heap_surgery.c | 2 +- contrib/pg_walinspect/pg_walinspect.c | 6 +- contrib/pgstattuple/pgstatapprox.c | 6 +- contrib/pgstattuple/pgstatindex.c | 4 +- contrib/pgstattuple/pgstattuple.c | 10 +- contrib/postgres_fdw/deparse.c | 2 +- contrib/postgres_fdw/postgres_fdw.c | 2 +- doc/src/sgml/catalogs.sgml | 4 +- doc/src/sgml/config.sgml | 39 ++-- doc/src/sgml/limits.sgml | 2 +- doc/src/sgml/monitoring.sgml | 4 +- doc/src/sgml/pgfreespacemap.sgml | 3 +- doc/src/sgml/ref/initdb.sgml | 16 ++ doc/src/sgml/ref/pg_resetwal.sgml | 19 ++ meson.build | 11 +- src/backend/access/brin/brin_bloom.c | 2 +- src/backend/access/brin/brin_pageops.c | 4 +- src/backend/access/common/bufmask.c | 4 +- src/backend/access/common/reloptions.c | 33 ++- src/backend/access/common/syncscan.c | 2 +- src/backend/access/common/toast_internals.c | 2 +- src/backend/access/gin/ginbtree.c | 12 +- src/backend/access/gin/gindatapage.c | 18 +- src/backend/access/gin/ginfast.c | 4 +- src/backend/access/gin/ginget.c | 6 +- src/backend/access/gin/ginvacuum.c | 2 +- src/backend/access/gin/ginxlog.c | 4 +- src/backend/access/gist/gist.c | 2 +- src/backend/access/gist/gistbuild.c | 16 +- src/backend/access/gist/gistbuildbuffers.c | 10 +- src/backend/access/gist/gistutil.c | 2 +- src/backend/access/gist/gistvacuum.c | 6 +- src/backend/access/hash/hash.c | 4 +- src/backend/access/hash/hashinsert.c | 2 +- src/backend/access/hash/hashovfl.c | 8 +- src/backend/access/hash/hashpage.c | 6 +- 
src/backend/access/heap/README.HOT | 2 +- src/backend/access/heap/heapam.c | 26 +-- src/backend/access/heap/heapam_handler.c | 8 +- src/backend/access/heap/heaptoast.c | 2 +- src/backend/access/heap/hio.c | 2 +- src/backend/access/heap/pruneheap.c | 14 +- src/backend/access/heap/rewriteheap.c | 6 +- src/backend/access/heap/vacuumlazy.c | 18 +- src/backend/access/heap/visibilitymap.c | 8 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtpage.c | 22 +- src/backend/access/nbtree/nbtree.c | 10 +- src/backend/access/nbtree/nbtsort.c | 12 +- src/backend/access/nbtree/nbtxlog.c | 4 +- src/backend/access/rmgrdesc/xlogdesc.c | 2 +- src/backend/access/spgist/spgdoinsert.c | 6 +- src/backend/access/spgist/spginsert.c | 2 +- src/backend/access/spgist/spgscan.c | 4 +- src/backend/access/spgist/spgtextproc.c | 10 +- src/backend/access/spgist/spgutils.c | 4 +- src/backend/access/spgist/spgvacuum.c | 22 +- src/backend/access/table/tableam.c | 2 +- src/backend/access/transam/README | 2 +- src/backend/access/transam/clog.c | 6 +- src/backend/access/transam/commit_ts.c | 6 +- src/backend/access/transam/generic_xlog.c | 18 +- src/backend/access/transam/multixact.c | 12 +- src/backend/access/transam/slru.c | 22 +- src/backend/access/transam/subtrans.c | 4 +- src/backend/access/transam/timeline.c | 2 +- src/backend/access/transam/varsup.c | 2 +- src/backend/access/transam/xlog.c | 132 ++++++++++-- src/backend/access/transam/xloginsert.c | 24 +-- src/backend/access/transam/xlogprefetcher.c | 2 +- src/backend/access/transam/xlogreader.c | 28 +-- src/backend/access/transam/xlogrecovery.c | 10 +- src/backend/backup/basebackup.c | 41 ++-- src/backend/backup/basebackup_lz4.c | 6 +- src/backend/backup/basebackup_zstd.c | 6 +- src/backend/bootstrap/bootstrap.c | 14 +- src/backend/catalog/storage.c | 2 +- src/backend/commands/analyze.c | 4 +- src/backend/commands/async.c | 6 +- src/backend/commands/vacuumparallel.c | 2 +- 
src/backend/executor/nodeAgg.c | 16 +- src/backend/nodes/tidbitmap.c | 6 +- src/backend/optimizer/path/costsize.c | 12 +- src/backend/optimizer/util/plancat.c | 2 +- src/backend/po/de.po | 8 +- src/backend/po/es.po | 6 +- src/backend/po/fr.po | 6 +- src/backend/po/id.po | 4 +- src/backend/po/it.po | 4 +- src/backend/po/ja.po | 8 +- src/backend/po/ko.po | 6 +- src/backend/po/pl.po | 4 +- src/backend/po/pt_BR.po | 2 +- src/backend/po/ru.po | 8 +- src/backend/po/sv.po | 4 +- src/backend/po/tr.po | 4 +- src/backend/po/uk.po | 4 +- src/backend/po/zh_CN.po | 4 +- src/backend/replication/logical/worker.c | 2 +- src/backend/storage/buffer/buf_init.c | 4 +- src/backend/storage/buffer/bufmgr.c | 22 +- src/backend/storage/buffer/freelist.c | 2 +- src/backend/storage/buffer/localbuf.c | 8 +- src/backend/storage/file/buffile.c | 14 +- src/backend/storage/file/copydir.c | 2 +- src/backend/storage/file/fd.c | 4 +- src/backend/storage/freespace/README | 8 +- src/backend/storage/freespace/freespace.c | 29 ++- src/backend/storage/freespace/indexfsm.c | 6 +- src/backend/storage/large_object/inv_api.c | 4 +- src/backend/storage/lmgr/predicate.c | 2 +- src/backend/storage/page/bufpage.c | 34 +-- src/backend/storage/smgr/README | 52 +++++ src/backend/storage/smgr/md.c | 126 +++++------ src/backend/utils/adt/pgstatfuncs.c | 4 +- src/backend/utils/adt/selfuncs.c | 2 +- src/backend/utils/init/miscinit.c | 6 +- src/backend/utils/misc/guc.c | 27 ++- src/backend/utils/misc/guc_tables.c | 39 +++- src/backend/utils/sort/logtape.c | 48 ++--- src/backend/utils/sort/sharedtuplestore.c | 12 +- src/backend/utils/sort/tuplesort.c | 6 +- src/bin/initdb/initdb.c | 60 ++++-- src/bin/initdb/meson.build | 1 + src/bin/initdb/t/002_blocksize.pl | 24 +++ src/bin/pg_basebackup/pg_basebackup.c | 7 + src/bin/pg_basebackup/pg_receivewal.c | 4 + src/bin/pg_basebackup/streamutil.c | 59 ++++++ src/bin/pg_basebackup/streamutil.h | 3 + src/bin/pg_checksums/pg_checksums.c | 34 ++- src/bin/pg_resetwal/pg_resetwal.c | 
54 ++++- src/bin/pg_resetwal/t/002_corrupted.pl | 4 +- src/bin/pg_rewind/filemap.c | 5 +- src/bin/pg_rewind/pg_rewind.c | 12 +- src/bin/pg_upgrade/file.c | 17 +- src/bin/pg_upgrade/meson.build | 2 + src/bin/pg_upgrade/pg_upgrade.c | 3 + src/bin/pg_upgrade/pg_upgrade.h | 2 +- src/bin/pg_waldump/pg_waldump.c | 36 +++- src/bin/pg_waldump/t/002_save_fullpage.pl | 1 + src/common/Makefile | 1 + src/common/blocksize.c | 56 +++++ src/common/file_utils.c | 6 +- src/common/meson.build | 1 + src/include/access/brin_page.h | 2 +- src/include/access/ginblock.h | 6 +- src/include/access/gist.h | 2 +- src/include/access/gist_private.h | 6 +- src/include/access/hash.h | 8 +- src/include/access/heapam.h | 2 +- src/include/access/heaptoast.h | 18 +- src/include/access/htup_details.h | 16 +- src/include/access/itup.h | 5 +- src/include/access/nbtree.h | 211 +------------------ src/include/access/nbtree_int.h | 220 ++++++++++++++++++++ src/include/access/slru.h | 2 +- src/include/access/spgist_private.h | 14 +- src/include/access/xlog_internal.h | 1 + src/include/access/xlogrecord.h | 6 +- src/include/backup/basebackup_sink.h | 4 +- src/include/c.h | 6 +- src/include/common/blocksize.h | 91 ++++++++ src/include/pg_config.h.in | 12 +- src/include/pg_config_manual.h | 10 +- src/include/postgres.h | 1 + src/include/storage/bufmgr.h | 4 +- src/include/storage/bufpage.h | 20 +- src/include/storage/checksum.h | 2 +- src/include/storage/checksum_impl.h | 13 +- src/include/storage/freespace.h | 1 + src/include/storage/fsm_internals.h | 4 +- src/include/storage/large_object.h | 6 +- src/include/storage/off.h | 4 +- src/include/utils/guc_tables.h | 1 + src/include/utils/rel.h | 4 +- src/test/modules/test_slru/test_slru.c | 2 +- src/test/regress/expected/btree_index.out | 4 +- src/test/regress/expected/largeobject.out | 2 +- src/test/regress/expected/largeobject_1.out | 2 +- src/test/regress/sql/btree_index.sql | 4 +- src/test/regress/sql/largeobject.sql | 2 +- src/tools/msvc/Mkvcbuild.pm | 2 +- 
src/tools/msvc/Solution.pm | 2 +- src/tools/pgindent/typedefs.list | 1 - 199 files changed, 1666 insertions(+), 970 deletions(-) create mode 100644 src/bin/initdb/t/002_blocksize.pl create mode 100644 src/common/blocksize.c create mode 100644 src/include/access/nbtree_int.h create mode 100644 src/include/common/blocksize.h diff --git a/configure b/configure index 8e5006bd4f..a374762c02 100755 --- a/configure +++ b/configure @@ -3714,12 +3714,12 @@ fi case ${blocksize} in - 1) BLCKSZ=1024;; - 2) BLCKSZ=2048;; - 4) BLCKSZ=4096;; - 8) BLCKSZ=8192;; - 16) BLCKSZ=16384;; - 32) BLCKSZ=32768;; + 1) DEFAULT_BLOCK_SIZE=1024;; + 2) DEFAULT_BLOCK_SIZE=2048;; + 4) DEFAULT_BLOCK_SIZE=4096;; + 8) DEFAULT_BLOCK_SIZE=8192;; + 16) DEFAULT_BLOCK_SIZE=16384;; + 32) DEFAULT_BLOCK_SIZE=32768;; *) as_fn_error $? "Invalid block size. Allowed values are 1,2,4,8,16,32." "$LINENO" 5 esac { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${blocksize}kB" >&5 @@ -3727,7 +3727,7 @@ $as_echo "${blocksize}kB" >&6; } cat >>confdefs.h <<_ACEOF -#define BLCKSZ ${BLCKSZ} +#define DEFAULT_BLOCK_SIZE ${DEFAULT_BLOCK_SIZE} _ACEOF diff --git a/configure.ac b/configure.ac index 0e8158a5e6..aa796ab9dc 100644 --- a/configure.ac +++ b/configure.ac @@ -258,28 +258,29 @@ PGAC_ARG_REQ(with, blocksize, [BLOCKSIZE], [set table block size in kB [8]], [blocksize=$withval], [blocksize=8]) case ${blocksize} in - 1) BLCKSZ=1024;; - 2) BLCKSZ=2048;; - 4) BLCKSZ=4096;; - 8) BLCKSZ=8192;; - 16) BLCKSZ=16384;; - 32) BLCKSZ=32768;; + 1) DEFAULT_BLOCK_SIZE=1024;; + 2) DEFAULT_BLOCK_SIZE=2048;; + 4) DEFAULT_BLOCK_SIZE=4096;; + 8) DEFAULT_BLOCK_SIZE=8192;; + 16) DEFAULT_BLOCK_SIZE=16384;; + 32) DEFAULT_BLOCK_SIZE=32768;; *) AC_MSG_ERROR([Invalid block size. Allowed values are 1,2,4,8,16,32.]) esac AC_MSG_RESULT([${blocksize}kB]) -AC_DEFINE_UNQUOTED([BLCKSZ], ${BLCKSZ}, [ +AC_DEFINE_UNQUOTED([DEFAULT_BLOCK_SIZE], ${DEFAULT_BLOCK_SIZE}, [ Size of a disk block --- this also limits the size of a tuple. 
You can set it bigger if you need bigger tuples (although TOAST should reduce the need to have large tuples, since fields can be spread across multiple tuples). - BLCKSZ must be a power of 2. The maximum possible value of BLCKSZ + DEFAULT_BLOCK_SIZE must be a power of 2. The maximum possible value of DEFAULT_BLOCK_SIZE is currently 2^15 (32768). This is determined by the 15-bit widths of the lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h). - Changing BLCKSZ requires an initdb. + Changing DEFAULT_BLOCK_SIZE requires an initdb, however an individual cluster + can have its block size set. ]) # @@ -313,10 +314,10 @@ fi AC_DEFINE_UNQUOTED([RELSEG_SIZE], ${RELSEG_SIZE}, [ RELSEG_SIZE is the maximum number of blocks allowed in one disk file. - Thus, the maximum size of a single file is RELSEG_SIZE * BLCKSZ; + Thus, the maximum size of a single file is RELSEG_SIZE * CLUSTER_BLOCK_SIZE; relations bigger than that are divided into multiple files. - RELSEG_SIZE * BLCKSZ must be less than your OS' limit on file size. + RELSEG_SIZE * CLUSTER_BLOCK_SIZE must be less than your OS' limit on file size. This is often 2 GB or 4GB in a 32-bit operating system, unless you have large file support enabled. By default, we make the limit 1 GB to avoid any possible integer-overflow problems within the OS. @@ -350,7 +351,7 @@ esac AC_MSG_RESULT([${wal_blocksize}kB]) AC_DEFINE_UNQUOTED([XLOG_BLCKSZ], ${XLOG_BLCKSZ}, [ - Size of a WAL file block. This need have no particular relation to BLCKSZ. + Size of a WAL file block. This need have no particular relation to CLUSTER_BLOCK_SIZE. XLOG_BLCKSZ must be a power of 2, and if your system supports O_DIRECT I/O, XLOG_BLCKSZ must be a multiple of the alignment requirement for direct-I/O buffers, else direct I/O may fail. 
diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c index 97f3253522..b4453f1797 100644 --- a/contrib/amcheck/verify_heapam.c +++ b/contrib/amcheck/verify_heapam.c @@ -403,11 +403,11 @@ verify_heapam(PG_FUNCTION_ARGS) for (ctx.blkno = first_block; ctx.blkno <= last_block; ctx.blkno++) { OffsetNumber maxoff; - OffsetNumber predecessor[MaxOffsetNumber]; - OffsetNumber successor[MaxOffsetNumber]; - bool lp_valid[MaxOffsetNumber]; - bool xmin_commit_status_ok[MaxOffsetNumber]; - XidCommitStatus xmin_commit_status[MaxOffsetNumber]; + OffsetNumber predecessor[MaxOffsetNumberLimit]; + OffsetNumber successor[MaxOffsetNumberLimit]; + bool lp_valid[MaxOffsetNumberLimit]; + bool xmin_commit_status_ok[MaxOffsetNumberLimit]; + XidCommitStatus xmin_commit_status[MaxOffsetNumberLimit]; CHECK_FOR_INTERRUPTS(); @@ -540,13 +540,13 @@ verify_heapam(PG_FUNCTION_ARGS) (unsigned) MAXALIGN(SizeofHeapTupleHeader))); continue; } - if (ctx.lp_off + ctx.lp_len > BLCKSZ) + if (ctx.lp_off + ctx.lp_len > CLUSTER_BLOCK_SIZE) { report_corruption(&ctx, psprintf("line pointer to page offset %u with length %u ends beyond maximum page offset %u", ctx.lp_off, ctx.lp_len, - (unsigned) BLCKSZ)); + (unsigned) CLUSTER_BLOCK_SIZE)); continue; } diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 94a9759322..62276d108e 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2962,7 +2962,7 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) BTPageOpaque opaque; OffsetNumber maxoffset; - page = palloc(BLCKSZ); + page = palloc(CLUSTER_BLOCK_SIZE); /* * We copy the page into local storage to avoid holding pin on the buffer @@ -2979,7 +2979,7 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) _bt_checkpage(state->rel, buffer); /* Only use copy of page in palloc()'d memory */ - memcpy(page, BufferGetPage(buffer), BLCKSZ); + memcpy(page, BufferGetPage(buffer), CLUSTER_BLOCK_SIZE); 
UnlockReleaseBuffer(buffer); opaque = BTPageGetOpaque(page); @@ -3163,7 +3163,7 @@ PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, ItemId itemid = PageGetItemId(page, offset); if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) > - BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))) + CLUSTER_BLOCK_SIZE - MAXALIGN(sizeof(BTPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("line pointer points past end of tuple space in index \"%s\"", diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index b42b9e6c41..3c36c985f3 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -52,7 +52,7 @@ flushCachedPage(Relation index, BloomBuildState *buildstate) state = GenericXLogStart(index); page = GenericXLogRegisterBuffer(state, buffer, GENERIC_XLOG_FULL_IMAGE); - memcpy(page, buildstate->data.data, BLCKSZ); + memcpy(page, buildstate->data.data, CLUSTER_BLOCK_SIZE); GenericXLogFinish(state); UnlockReleaseBuffer(buffer); } @@ -166,7 +166,7 @@ blbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + metapage = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); BloomFillMetapage(index, metapage); /* diff --git a/contrib/bloom/bloom.h b/contrib/bloom/bloom.h index efdf9415d1..683ecb8a77 100644 --- a/contrib/bloom/bloom.h +++ b/contrib/bloom/bloom.h @@ -106,16 +106,9 @@ typedef struct BloomOptions * index key */ } BloomOptions; -/* - * FreeBlockNumberArray - array of block numbers sized so that metadata fill - * all space in metapage. 
- */ -typedef BlockNumber FreeBlockNumberArray[ - MAXALIGN_DOWN( - BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData)) - - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions)) - ) / sizeof(BlockNumber) -]; +#define FREE_BLOCK_NUMBER_ELEMS CalcFreeBlockNumberElems(CLUSTER_BLOCK_SIZE) + + /* Metadata of bloom index */ typedef struct BloomMetaPageData @@ -124,14 +117,18 @@ typedef struct BloomMetaPageData uint16 nStart; uint16 nEnd; BloomOptions opts; - FreeBlockNumberArray notFullPage; + BlockNumber notFullPage[FLEXIBLE_ARRAY_MEMBER]; } BloomMetaPageData; +#define SizeOfBloomMetaPageData (offsetof(BloomMetaPageData,notFullPage) + sizeof(BlockNumber) * BloomMetaBlockN) + /* Magic number to distinguish bloom pages among anothers */ #define BLOOM_MAGICK_NUMBER (0xDBAC0DED) /* Number of blocks numbers fit in BloomMetaPageData */ -#define BloomMetaBlockN (sizeof(FreeBlockNumberArray) / sizeof(BlockNumber)) +#define BloomMetaBlockN (MAXALIGN_DOWN(CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData)) \ + - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions)) \ + ) / sizeof(BlockNumber)) #define BloomPageGetMeta(page) ((BloomMetaPageData *) PageGetContents(page)) @@ -150,7 +147,7 @@ typedef struct BloomState } BloomState; #define BloomPageGetFreeSpace(state, page) \ - (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + (CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData) \ - BloomPageGetMaxOffset(page) * (state)->sizeOfBloomTuple \ - MAXALIGN(sizeof(BloomPageOpaqueData))) diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index d935ed8fbd..c8abdc86f6 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -400,7 +400,7 @@ BloomInitPage(Page page, uint16 flags) { BloomPageOpaque opaque; - PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData)); + PageInit(page, CLUSTER_BLOCK_SIZE, sizeof(BloomPageOpaqueData)); opaque = BloomPageGetOpaque(page); opaque->flags = flags; @@ -430,10 
+430,10 @@ BloomFillMetapage(Relation index, Page metaPage) */ BloomInitPage(metaPage, BLOOM_META); metadata = BloomPageGetMeta(metaPage); - memset(metadata, 0, sizeof(BloomMetaPageData)); + memset(metadata, 0, SizeOfBloomMetaPageData); metadata->magickNumber = BLOOM_MAGICK_NUMBER; metadata->opts = *opts; - ((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData); + ((PageHeader) metaPage)->pd_lower += SizeOfBloomMetaPageData; /* If this fails, probably FreeBlockNumberArray size calc is wrong: */ Assert(((PageHeader) metaPage)->pd_lower <= ((PageHeader) metaPage)->pd_upper); diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index 2340d49e00..373227af98 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -37,7 +37,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Relation index = info->index; BlockNumber blkno, npages; - FreeBlockNumberArray notFullPage; + BlockNumber notFullPage[CalcFreeBlockNumberElems(MAX_BLOCK_SIZE)]; int countPage = 0; BloomState state; Buffer buffer; diff --git a/contrib/file_fdw/file_fdw.c b/contrib/file_fdw/file_fdw.c index 9e330b9934..e8e159c81b 100644 --- a/contrib/file_fdw/file_fdw.c +++ b/contrib/file_fdw/file_fdw.c @@ -839,7 +839,7 @@ fileAnalyzeForeignTable(Relation relation, * Convert size to pages. Must return at least 1 so that we can tell * later on that pg_class.relpages is not default. */ - *totalpages = (stat_buf.st_size + (BLCKSZ - 1)) / BLCKSZ; + *totalpages = (stat_buf.st_size + (CLUSTER_BLOCK_SIZE - 1)) / CLUSTER_BLOCK_SIZE; if (*totalpages < 1) *totalpages = 1; @@ -1009,12 +1009,12 @@ estimate_size(PlannerInfo *root, RelOptInfo *baserel, * back to the default if using a program as the input. */ if (fdw_private->is_program || stat(fdw_private->filename, &stat_buf) < 0) - stat_buf.st_size = 10 * BLCKSZ; + stat_buf.st_size = 10 * CLUSTER_BLOCK_SIZE; /* * Convert size to pages for use in I/O cost estimate later. 
*/ - pages = (stat_buf.st_size + (BLCKSZ - 1)) / BLCKSZ; + pages = (stat_buf.st_size + (CLUSTER_BLOCK_SIZE - 1)) / CLUSTER_BLOCK_SIZE; if (pages < 1) pages = 1; fdw_private->pages = pages; diff --git a/contrib/pageinspect/btreefuncs.c b/contrib/pageinspect/btreefuncs.c index 9cdc8e182b..dd6115cbc7 100644 --- a/contrib/pageinspect/btreefuncs.c +++ b/contrib/pageinspect/btreefuncs.c @@ -116,7 +116,7 @@ GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) stat->blkno = blkno; - stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData); + stat->max_avail = CLUSTER_BLOCK_SIZE - (CLUSTER_BLOCK_SIZE - phdr->pd_special + SizeOfPageHeaderData); stat->dead_items = stat->live_items = 0; @@ -663,8 +663,8 @@ bt_page_items_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) uargs = palloc(sizeof(ua_page_items)); - uargs->page = palloc(BLCKSZ); - memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); + uargs->page = palloc(CLUSTER_BLOCK_SIZE); + memcpy(uargs->page, BufferGetPage(buffer), CLUSTER_BLOCK_SIZE); UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); diff --git a/contrib/pageinspect/expected/checksum.out b/contrib/pageinspect/expected/checksum.out index a85388e158..2cebe89754 100644 --- a/contrib/pageinspect/expected/checksum.out +++ b/contrib/pageinspect/expected/checksum.out @@ -6,7 +6,7 @@ -- on the configured block size. This test has several different expected -- results files to handle the following possibilities: -- --- BLCKSZ end file +-- CLUSTER_BLOCK_SIZE end file -- 8K LE checksum.out -- 8K BE checksum_1.out -- diff --git a/contrib/pageinspect/expected/checksum_1.out b/contrib/pageinspect/expected/checksum_1.out index 6fb1b1b04d..807a70dacd 100644 --- a/contrib/pageinspect/expected/checksum_1.out +++ b/contrib/pageinspect/expected/checksum_1.out @@ -6,7 +6,7 @@ -- on the configured block size. 
This test has several different expected -- results files to handle the following possibilities: -- --- BLCKSZ end file +-- CLUSTER_BLOCK_SIZE end file -- 8K LE checksum.out -- 8K BE checksum_1.out -- diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index b25a63cbd6..cb8bd8de91 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -179,8 +179,8 @@ get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) blkno, RelationGetRelationName(rel)))); /* Initialize buffer to copy to */ - raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page = (bytea *) palloc(CLUSTER_BLOCK_SIZE + VARHDRSZ); + SET_VARSIZE(raw_page, CLUSTER_BLOCK_SIZE + VARHDRSZ); raw_page_data = VARDATA(raw_page); /* Take a verbatim copy of the page */ @@ -188,7 +188,7 @@ get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) buf = ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL); LockBuffer(buf, BUFFER_LOCK_SHARE); - memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); + memcpy(raw_page_data, BufferGetPage(buf), CLUSTER_BLOCK_SIZE); LockBuffer(buf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); @@ -219,12 +219,12 @@ get_page_from_raw(bytea *raw_page) raw_page_size = VARSIZE_ANY_EXHDR(raw_page); - if (raw_page_size != BLCKSZ) + if (raw_page_size != CLUSTER_BLOCK_SIZE) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid page size"), errdetail("Expected %d bytes, got %d.", - BLCKSZ, raw_page_size))); + CLUSTER_BLOCK_SIZE, raw_page_size))); page = palloc(raw_page_size); @@ -357,7 +357,7 @@ page_checksum_internal(PG_FUNCTION_ARGS, enum pageinspect_version ext_version) if (PageIsNew(page)) PG_RETURN_NULL(); - PG_RETURN_INT16(pg_checksum_page((char *) page, blkno)); + PG_RETURN_INT16(pg_checksum_page((char *) page, blkno, CLUSTER_BLOCK_SIZE)); } Datum diff --git a/contrib/pageinspect/sql/checksum.sql b/contrib/pageinspect/sql/checksum.sql index 
b877db0611..98eb1c3b8c 100644 --- a/contrib/pageinspect/sql/checksum.sql +++ b/contrib/pageinspect/sql/checksum.sql @@ -6,7 +6,7 @@ -- on the configured block size. This test has several different expected -- results files to handle the following possibilities: -- --- BLCKSZ end file +-- CLUSTER_BLOCK_SIZE end file -- 8K LE checksum.out -- 8K BE checksum_1.out -- diff --git a/contrib/pg_surgery/heap_surgery.c b/contrib/pg_surgery/heap_surgery.c index 88a40ab7d3..58c4819675 100644 --- a/contrib/pg_surgery/heap_surgery.c +++ b/contrib/pg_surgery/heap_surgery.c @@ -89,7 +89,7 @@ heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt) Relation rel; OffsetNumber curr_start_ptr, next_start_ptr; - bool include_this_tid[MaxHeapTuplesPerPage]; + bool include_this_tid[MaxHeapTuplesPerPageLimit]; if (RecoveryInProgress()) ereport(ERROR, diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c index 796a74f322..3a5a852989 100644 --- a/contrib/pg_walinspect/pg_walinspect.c +++ b/contrib/pg_walinspect/pg_walinspect.c @@ -386,9 +386,9 @@ GetWALBlockInfo(FunctionCallInfo fcinfo, XLogReaderState *record, (errcode(ERRCODE_INTERNAL_ERROR), errmsg_internal("%s", record->errormsg_buf))); - block_fpi_data = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(block_fpi_data, BLCKSZ + VARHDRSZ); - memcpy(VARDATA(block_fpi_data), page, BLCKSZ); + block_fpi_data = (bytea *) palloc(CLUSTER_BLOCK_SIZE + VARHDRSZ); + SET_VARSIZE(block_fpi_data, CLUSTER_BLOCK_SIZE + VARHDRSZ); + memcpy(VARDATA(block_fpi_data), page, CLUSTER_BLOCK_SIZE); values[i++] = PointerGetDatum(block_fpi_data); } else diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index f601dc6121..d961e91f09 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -94,7 +94,7 @@ statapprox_heap(Relation rel, output_type *stat) if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) { freespace = GetRecordedFreeSpace(rel, 
blkno); - stat->tuple_len += BLCKSZ - freespace; + stat->tuple_len += CLUSTER_BLOCK_SIZE - freespace; stat->free_space += freespace; continue; } @@ -113,7 +113,7 @@ statapprox_heap(Relation rel, output_type *stat) if (!PageIsNew(page)) stat->free_space += PageGetHeapFreeSpace(page); else - stat->free_space += BLCKSZ - SizeOfPageHeaderData; + stat->free_space += CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData; /* We may count the page as scanned even if it's new/empty */ scanned++; @@ -182,7 +182,7 @@ statapprox_heap(Relation rel, output_type *stat) UnlockReleaseBuffer(buf); } - stat->table_len = (uint64) nblocks * BLCKSZ; + stat->table_len = (uint64) nblocks * CLUSTER_BLOCK_SIZE; /* * We don't know how many tuples are in the pages we didn't scan, so diff --git a/contrib/pgstattuple/pgstatindex.c b/contrib/pgstattuple/pgstatindex.c index d69ac1c93d..a0367184b8 100644 --- a/contrib/pgstattuple/pgstatindex.c +++ b/contrib/pgstattuple/pgstatindex.c @@ -297,7 +297,7 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) { int max_avail; - max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData); + max_avail = CLUSTER_BLOCK_SIZE - (CLUSTER_BLOCK_SIZE - ((PageHeader) page)->pd_special + SizeOfPageHeaderData); indexStat.max_avail += max_avail; indexStat.free_space += PageGetFreeSpace(page); @@ -342,7 +342,7 @@ pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo) indexStat.leaf_pages + indexStat.internal_pages + indexStat.deleted_pages + - indexStat.empty_pages) * BLCKSZ); + indexStat.empty_pages) * CLUSTER_BLOCK_SIZE); values[j++] = psprintf("%u", indexStat.root_blkno); values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages); values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages); diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 93b7834b77..91f5e5dcfe 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -386,7 +386,7 @@ pgstat_heap(Relation rel, 
FunctionCallInfo fcinfo) table_endscan(scan); relation_close(rel, AccessShareLock); - stat.table_len = (uint64) nblocks * BLCKSZ; + stat.table_len = (uint64) nblocks * CLUSTER_BLOCK_SIZE; return build_pgstattuple_type(&stat, fcinfo); } @@ -409,7 +409,7 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, if (PageIsNew(page)) { /* fully empty page */ - stat->free_space += BLCKSZ; + stat->free_space += CLUSTER_BLOCK_SIZE; } else { @@ -419,7 +419,7 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, if (P_IGNORE(opaque)) { /* deleted or half-dead page */ - stat->free_space += BLCKSZ; + stat->free_space += CLUSTER_BLOCK_SIZE; } else if (P_ISLEAF(opaque)) { @@ -456,7 +456,7 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, switch (opaque->hasho_flag & LH_PAGE_TYPE) { case LH_UNUSED_PAGE: - stat->free_space += BLCKSZ; + stat->free_space += CLUSTER_BLOCK_SIZE; break; case LH_BUCKET_PAGE: case LH_OVERFLOW_PAGE: @@ -531,7 +531,7 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, /* Quit if we've scanned the whole relation */ if (blkno >= nblocks) { - stat.table_len = (uint64) nblocks * BLCKSZ; + stat.table_len = (uint64) nblocks * CLUSTER_BLOCK_SIZE; break; } diff --git a/contrib/postgres_fdw/deparse.c b/contrib/postgres_fdw/deparse.c index 09d6dd60dd..74f0d2d16d 100644 --- a/contrib/postgres_fdw/deparse.c +++ b/contrib/postgres_fdw/deparse.c @@ -2364,7 +2364,7 @@ deparseAnalyzeSizeSql(StringInfo buf, Relation rel) appendStringInfoString(buf, "SELECT pg_catalog.pg_relation_size("); deparseStringLiteral(buf, relname.data); - appendStringInfo(buf, "::pg_catalog.regclass) / %d", BLCKSZ); + appendStringInfo(buf, "::pg_catalog.regclass) / %d", CLUSTER_BLOCK_SIZE); } /* diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c index c5cada55fb..2acc44a421 100644 --- a/contrib/postgres_fdw/postgres_fdw.c +++ b/contrib/postgres_fdw/postgres_fdw.c @@ -755,7 
+755,7 @@ postgresGetForeignRelSize(PlannerInfo *root, { baserel->pages = 10; baserel->tuples = - (10 * BLCKSZ) / (baserel->reltarget->width + + (10 * CLUSTER_BLOCK_SIZE) / (baserel->reltarget->width + MAXALIGN(SizeofHeapTupleHeader)); } diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index ed32ca0349..180d809249 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -2022,7 +2022,7 @@ SCRAM-SHA-256$<iteration count>:&l Size of the on-disk representation of this table in pages (of size - BLCKSZ). This is only an estimate used by the + CLUSTER_BLOCK_SIZE). This is only an estimate used by the planner. It is updated by VACUUM, ANALYZE, and a few DDL commands such as CREATE INDEX. @@ -4885,7 +4885,7 @@ SCRAM-SHA-256$<iteration count>:&l segments or pages small enough to be conveniently stored as rows in pg_largeobject. The amount of data per page is defined to be LOBLKSIZE (which is currently - BLCKSZ/4, or typically 2 kB). + CLUSTER_BLOCK_SIZE/4, or typically 2 kB). diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 6262cb7bb2..6880976700 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1683,8 +1683,8 @@ include_dir 'conf.d' settings significantly higher than the minimum are usually needed for good performance. If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. - (Non-default values of BLCKSZ change the minimum + that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. + (Non-default values of CLUSTER_BLOCK_SIZE change the minimum value.) This parameter can only be set at server start. @@ -1812,9 +1812,9 @@ include_dir 'conf.d' each database session. These are session-local buffers used only for access to temporary tables. If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. + that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. The default is eight megabytes (8MB). 
- (If BLCKSZ is not 8kB, the default value scales + (If CLUSTER_BLOCK_SIZE is not 8kB, the default value scales proportionally to it.) This setting can be changed within individual sessions, but only before the first use of temporary tables @@ -1829,7 +1829,7 @@ include_dir 'conf.d' buffers is only a buffer descriptor, or about 64 bytes, per increment in temp_buffers. However if a buffer is actually used an additional 8192 bytes will be consumed for it - (or in general, BLCKSZ bytes). + (or in general, CLUSTER_BLOCK_SIZE bytes). @@ -2486,11 +2486,11 @@ include_dir 'conf.d' cache, where performance might degrade. This setting may have no effect on some platforms. If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. + that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. The valid range is between 0, which disables forced writeback, and 2MB. The default is 512kB on Linux, - 0 elsewhere. (If BLCKSZ is not 8kB, + 0 elsewhere. (If CLUSTER_BLOCK_SIZE is not 8kB, the default and maximum values scale proportionally to it.) This parameter can only be set in the postgresql.conf file or on the server command line. @@ -2532,11 +2532,11 @@ include_dir 'conf.d' than the OS's page cache, where performance might degrade. This setting may have no effect on some platforms. If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. + that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. The valid range is between 0, which disables forced writeback, and 2MB. The default is 0, i.e., no - forced writeback. (If BLCKSZ is not 8kB, + forced writeback. (If CLUSTER_BLOCK_SIZE is not 8kB, the maximum value scales proportionally to it.) @@ -3539,11 +3539,11 @@ include_dir 'conf.d' than the OS's page cache, where performance might degrade. This setting may have no effect on some platforms. If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. 
+ that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. The valid range is between 0, which disables forced writeback, and 2MB. The default is 256kB on - Linux, 0 elsewhere. (If BLCKSZ is not + Linux, 0 elsewhere. (If CLUSTER_BLOCK_SIZE is not 8kB, the default and maximum values scale proportionally to it.) This parameter can only be set in the postgresql.conf file or on the server command line. @@ -5665,7 +5665,7 @@ ANY num_sync ( num_sync ( . If this value is specified without units, it is taken as blocks, - that is BLCKSZ bytes, typically 8kB. + that is CLUSTER_BLOCK_SIZE bytes, typically 8kB. The default is 512 kilobytes (512kB). @@ -5719,9 +5719,9 @@ ANY num_sync ( ) is influenced by block_size. See for information. + linkend="runtime-config-resource"/> for information on GUC configuration + or for more information on + selecting a cluster's block size. diff --git a/doc/src/sgml/limits.sgml b/doc/src/sgml/limits.sgml index d5b2b627dd..92194779da 100644 --- a/doc/src/sgml/limits.sgml +++ b/doc/src/sgml/limits.sgml @@ -46,7 +46,7 @@ relation size 32 TB - with the default BLCKSZ of 8192 bytes + with the default CLUSTER_BLOCK_SIZE of 8192 bytes diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 5cfdc70c03..8d44648c17 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -3927,8 +3927,8 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i Relation data reads, writes, and extends are done in - block_size units, derived from the build-time - parameter BLCKSZ, which is 8192 by + block_size units, derived from the cluster init-time + parameter CLUSTER_BLOCK_SIZE, which is 8192 by default. diff --git a/doc/src/sgml/pgfreespacemap.sgml b/doc/src/sgml/pgfreespacemap.sgml index 829ad60f32..1ac1561a74 100644 --- a/doc/src/sgml/pgfreespacemap.sgml +++ b/doc/src/sgml/pgfreespacemap.sgml @@ -60,7 +60,8 @@ The values stored in the free space map are not exact. 
They're rounded - to precision of 1/256th of BLCKSZ (32 bytes with default BLCKSZ), and + to precision of 1/256th of CLUSTER_BLOCK_SIZE (32 bytes + with default CLUSTER_BLOCK_SIZE), and they're not kept fully up-to-date as tuples are inserted and updated. diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml index 22f1011781..6ba8fc9771 100644 --- a/doc/src/sgml/ref/initdb.sgml +++ b/doc/src/sgml/ref/initdb.sgml @@ -277,6 +277,22 @@ PostgreSQL documentation + + + + + + Sets the cluster's database block size + to blocksize kilobytes. This should be one + of 1, 2, 4, 8, 16, or 32. Many database limits are defined relative + to this parameter. If unspecified, defaults to 8. + + + This used to be the compile-time parameter BLCKSZ. + + + + diff --git a/doc/src/sgml/ref/pg_resetwal.sgml b/doc/src/sgml/ref/pg_resetwal.sgml index fd539f5604..54dd73bb12 100644 --- a/doc/src/sgml/ref/pg_resetwal.sgml +++ b/doc/src/sgml/ref/pg_resetwal.sgml @@ -136,6 +136,25 @@ PostgreSQL documentation + + + + + + Sets the cluster's database block size + to blocksize kilobytes. This should be one + of 1, 2, 4, 8, 16, or 32. Many database limits are defined relative + to this parameter. + + + This must be the same value selected + at initdb time, or this will result in database + corruption. This option should only be provided + when pg_control is corrupted or missing. + + + + diff --git a/meson.build b/meson.build index e408967fd4..f7fd4a7c49 100644 --- a/meson.build +++ b/meson.build @@ -440,14 +440,15 @@ else segsize = (get_option('segsize') * 1024 * 1024 * 1024) / blocksize endif -cdata.set('BLCKSZ', blocksize, description: -'''Size of a disk block --- this also limits the size of a tuple. You can set +cdata.set('DEFAULT_BLOCK_SIZE', blocksize, description: +'''Default size of a disk block --- this also limits the size of a tuple. 
You can set it bigger if you need bigger tuples (although TOAST should reduce the need to have large tuples, since fields can be spread across multiple tuples). - BLCKSZ must be a power of 2. The maximum possible value of BLCKSZ is + DEFAULT_BLOCK_SIZE must be a power of 2. The maximum possible value of DEFAULT_BLOCK_SIZE is currently 2^15 (32768). This is determined by the 15-bit widths of the lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h). - Changing BLCKSZ requires an initdb.''') + Changing DEFAULT_BLOCK_SIZE requires an initdb, however individual clusters + can have different values for block size (CLUSTER_BLOCK_SIZE)''') cdata.set('XLOG_BLCKSZ', get_option('wal_blocksize').to_int() * 1024) cdata.set('RELSEG_SIZE', segsize) @@ -3311,7 +3312,7 @@ if meson.version().version_compare('>=0.57') summary( { - 'data block size': '@0@ kB'.format(cdata.get('BLCKSZ') / 1024), + 'data block size': '@0@ kB'.format(cdata.get('DEFAULT_BLOCK_SIZE') / 1024), 'WAL block size': '@0@ kB'.format(cdata.get('XLOG_BLCKSZ') / 1024), 'segment size': get_option('segsize_blocks') != 0 ? '@0@ blocks'.format(cdata.get('RELSEG_SIZE')) : diff --git a/src/backend/access/brin/brin_bloom.c b/src/backend/access/brin/brin_bloom.c index e4953a9d37..7e0976b2b0 100644 --- a/src/backend/access/brin/brin_bloom.c +++ b/src/backend/access/brin/brin_bloom.c @@ -212,7 +212,7 @@ typedef struct BloomOptions * be larger because the index has multiple columns. */ #define BloomMaxFilterSize \ - MAXALIGN_DOWN(BLCKSZ - \ + MAXALIGN_DOWN(CLUSTER_BLOCK_SIZE - \ (MAXALIGN(SizeOfPageHeaderData + \ sizeof(ItemIdData)) + \ MAXALIGN(sizeof(BrinSpecialSpace)) + \ diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index b578d25954..a38605b02b 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -27,7 +27,7 @@ * a single item per page, unlike other index AMs. 
*/ #define BrinMaxItemSize \ - MAXALIGN_DOWN(BLCKSZ - \ + MAXALIGN_DOWN(CLUSTER_BLOCK_SIZE - \ (MAXALIGN(SizeOfPageHeaderData + \ sizeof(ItemIdData)) + \ MAXALIGN(sizeof(BrinSpecialSpace)))) @@ -475,7 +475,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, void brin_page_init(Page page, uint16 type) { - PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace)); + PageInit(page, CLUSTER_BLOCK_SIZE, sizeof(BrinSpecialSpace)); BrinPageType(page) = type; } diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index 5e392dab1e..258b7110d9 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -76,7 +76,7 @@ mask_unused_space(Page page) /* Sanity check */ if (pd_lower > pd_upper || pd_special < pd_upper || - pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + pd_lower < SizeOfPageHeaderData || pd_special > CLUSTER_BLOCK_SIZE) { elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u", pd_lower, pd_upper, pd_special); @@ -120,7 +120,7 @@ mask_page_content(Page page) { /* Mask Page Content */ memset(page + SizeOfPageHeaderData, MASK_MARKER, - BLCKSZ - SizeOfPageHeaderData); + CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData); /* Mask pd_lower and pd_upper */ memset(&((PageHeader) page)->pd_lower, MASK_MARKER, diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 11cc431677..36d59d095b 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -28,6 +28,7 @@ #include "commands/defrem.h" #include "commands/tablespace.h" #include "commands/view.h" +#include "common/blocksize.h" #include "nodes/makefuncs.h" #include "postmaster/postmaster.h" #include "utils/array.h" @@ -328,7 +329,8 @@ static relopt_int intRelOpts[] = RELOPT_KIND_HEAP, ShareUpdateExclusiveLock }, - TOAST_TUPLE_TARGET, 128, TOAST_TUPLE_TARGET_MAIN + /* NOTE: these limits are dynamically updated */ + 0, 0, 0 }, { { @@ -563,6 +565,8 @@ static void 
initialize_reloptions(void); static void parse_one_reloption(relopt_value *option, char *text_str, int text_len, bool validate); +static void update_dynamic_reloptions(void); + /* * Get the length of a string reloption (either default or the user-defined * value). This is used for allocation purposes when building a set of @@ -572,6 +576,26 @@ static void parse_one_reloption(relopt_value *option, char *text_str, ((option).isset ? strlen((option).values.string_val) : \ ((relopt_string *) (option).gen)->default_len) +/* + * handle adjustments to the config table based on dynamic parameters' limits + */ + +static void +update_dynamic_reloptions(void) +{ + int i; + + for (i = 0; intRelOpts[i].gen.name; i++) + { + if (strcmp("toast_tuple_target", intRelOpts[i].gen.name) == 0) + { + intRelOpts[i].min = 128; + intRelOpts[i].default_val = TOAST_TUPLE_TARGET; + intRelOpts[i].max = TOAST_TUPLE_TARGET_MAIN; + } + } +} + /* * initialize_reloptions * initialization routine, must be called before parsing @@ -584,6 +608,13 @@ initialize_reloptions(void) int i; int j; + /* + * Set the dynamic limits based on block size; if we get multiple can make + * more sophisticated. + */ + + update_dynamic_reloptions(); + j = 0; for (i = 0; boolRelOpts[i].gen.name; i++) { diff --git a/src/backend/access/common/syncscan.c b/src/backend/access/common/syncscan.c index 2bc6828883..21aa57d620 100644 --- a/src/backend/access/common/syncscan.c +++ b/src/backend/access/common/syncscan.c @@ -80,7 +80,7 @@ bool trace_syncscan = false; * the buffer cache anyway, and on the other hand the page is most likely * still in the OS cache. 
*/ -#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ) +#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / CLUSTER_BLOCK_SIZE) /* diff --git a/src/backend/access/common/toast_internals.c b/src/backend/access/common/toast_internals.c index 588825ed85..e4f11bc35e 100644 --- a/src/backend/access/common/toast_internals.c +++ b/src/backend/access/common/toast_internals.c @@ -133,7 +133,7 @@ toast_save_datum(Relation rel, Datum value, { struct varlena hdr; /* this is to make the union big enough for a chunk: */ - char data[TOAST_MAX_CHUNK_SIZE + VARHDRSZ]; + char data[TOAST_MAX_CHUNK_SIZE_LIMIT + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; } chunk_data; diff --git a/src/backend/access/gin/ginbtree.c b/src/backend/access/gin/ginbtree.c index 35490c7283..c4352d39a2 100644 --- a/src/backend/access/gin/ginbtree.c +++ b/src/backend/access/gin/ginbtree.c @@ -510,7 +510,7 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, * critical section yet.) */ newrootpg = PageGetTempPage(newrpage); - GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), BLCKSZ); + GinInitPage(newrootpg, GinPageGetOpaque(newlpage)->flags & ~(GIN_LEAF | GIN_COMPRESSED), CLUSTER_BLOCK_SIZE); btree->fillRoot(btree, newrootpg, BufferGetBlockNumber(lbuffer), newlpage, @@ -567,15 +567,15 @@ ginPlaceToPage(GinBtree btree, GinBtreeStack *stack, { /* Splitting the root, three pages to update */ MarkBufferDirty(lbuffer); - memcpy(page, newrootpg, BLCKSZ); - memcpy(BufferGetPage(lbuffer), newlpage, BLCKSZ); - memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + memcpy(page, newrootpg, CLUSTER_BLOCK_SIZE); + memcpy(BufferGetPage(lbuffer), newlpage, CLUSTER_BLOCK_SIZE); + memcpy(BufferGetPage(rbuffer), newrpage, CLUSTER_BLOCK_SIZE); } else { /* Normal split, only two pages to update */ - memcpy(page, newlpage, BLCKSZ); - memcpy(BufferGetPage(rbuffer), newrpage, BLCKSZ); + memcpy(page, newlpage, CLUSTER_BLOCK_SIZE); + memcpy(BufferGetPage(rbuffer), 
newrpage, CLUSTER_BLOCK_SIZE); } /* We also clear childbuf's INCOMPLETE_SPLIT flag, if passed */ diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index 9caeac164a..a280136253 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -655,7 +655,7 @@ dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, break; if (append) { - if ((leaf->lsize - segsize) < (BLCKSZ * 3) / 4) + if ((leaf->lsize - segsize) < (CLUSTER_BLOCK_SIZE * 3) / 4) break; } @@ -681,8 +681,8 @@ dataBeginPlaceToPageLeaf(GinBtree btree, Buffer buf, GinBtreeStack *stack, /* * Now allocate a couple of temporary page images, and fill them. */ - *newlpage = palloc(BLCKSZ); - *newrpage = palloc(BLCKSZ); + *newlpage = palloc(CLUSTER_BLOCK_SIZE); + *newrpage = palloc(CLUSTER_BLOCK_SIZE); dataPlaceToPageLeafSplit(leaf, lbound, rbound, *newlpage, *newrpage); @@ -887,7 +887,7 @@ computeLeafRecompressWALData(disassembledLeaf *leaf) walbufbegin = palloc(sizeof(ginxlogRecompressDataLeaf) + - BLCKSZ + /* max size needed to hold the segment data */ + CLUSTER_BLOCK_SIZE + /* max size needed to hold the segment data */ nmodified * 2 /* (segno + action) per action */ ); walbufend = walbufbegin; @@ -1041,8 +1041,8 @@ dataPlaceToPageLeafSplit(disassembledLeaf *leaf, leafSegmentInfo *seginfo; /* Initialize temporary pages to hold the new left and right pages */ - GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); - GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + GinInitPage(lpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, CLUSTER_BLOCK_SIZE); + GinInitPage(rpage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, CLUSTER_BLOCK_SIZE); /* * Copy the segments that go to the left page. 
@@ -1259,7 +1259,7 @@ dataSplitPageInternal(GinBtree btree, Buffer origbuf, Page lpage; Page rpage; OffsetNumber separator; - PostingItem allitems[(BLCKSZ / sizeof(PostingItem)) + 1]; + PostingItem allitems[(MAX_BLOCK_SIZE / sizeof(PostingItem)) + 1]; lpage = PageGetTempPage(oldpage); rpage = PageGetTempPage(oldpage); @@ -1779,8 +1779,8 @@ createPostingTree(Relation index, ItemPointerData *items, uint32 nitems, bool is_build = (buildStats != NULL); /* Construct the new root page in memory first. */ - tmppage = (Page) palloc(BLCKSZ); - GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, BLCKSZ); + tmppage = (Page) palloc(CLUSTER_BLOCK_SIZE); + GinInitPage(tmppage, GIN_DATA | GIN_LEAF | GIN_COMPRESSED, CLUSTER_BLOCK_SIZE); GinPageGetOpaque(tmppage)->rightlink = InvalidBlockNumber; /* diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c index ca7d770d86..f1128eef6b 100644 --- a/src/backend/access/gin/ginfast.c +++ b/src/backend/access/gin/ginfast.c @@ -39,7 +39,7 @@ int gin_pending_list_limit = 0; #define GIN_PAGE_FREESIZE \ - ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) + ( CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) typedef struct KeyArray { @@ -92,7 +92,7 @@ writeListPage(Relation index, Buffer buffer, off++; } - Assert(size <= BLCKSZ); /* else we overran workspace */ + Assert(size <= CLUSTER_BLOCK_SIZE); /* else we overran workspace */ GinPageGetOpaque(page)->rightlink = rightlink; diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index cb676a710f..e816b1b7d1 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -1630,9 +1630,9 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos) */ for (;;) { - Datum datum[BLCKSZ / sizeof(IndexTupleData)]; - GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)]; - bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)]; + Datum 
datum[MAX_BLOCK_SIZE / sizeof(IndexTupleData)]; + GinNullCategory category[MAX_BLOCK_SIZE / sizeof(IndexTupleData)]; + bool datumExtracted[MAX_BLOCK_SIZE / sizeof(IndexTupleData)]; Assert(pos->lastOffset > pos->firstOffset); memset(datumExtracted + pos->firstOffset - 1, 0, diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index e5d310d836..c7198d85c9 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -569,7 +569,7 @@ ginbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber blkno = GIN_ROOT_BLKNO; GinVacuumState gvs; Buffer buffer; - BlockNumber rootOfPostingTree[BLCKSZ / (sizeof(IndexTupleData) + sizeof(ItemId))]; + BlockNumber rootOfPostingTree[MAX_BLOCK_SIZE / (sizeof(IndexTupleData) + sizeof(ItemId))]; uint32 nRoot; gvs.tmpCxt = AllocSetContextCreate(CurrentMemoryContext, diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index f7c84beef8..761e6fedf0 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -146,7 +146,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) GinPostingList *plist; plist = ginCompressPostingList(uncompressed, nuncompressed, - BLCKSZ, &npacked); + CLUSTER_BLOCK_SIZE, &npacked); totalsize = SizeOfGinPostingList(plist); Assert(npacked == nuncompressed); @@ -236,7 +236,7 @@ ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) Assert(nnewitems == nolditems + nitems); newseg = ginCompressPostingList(newitems, nnewitems, - BLCKSZ, &npacked); + CLUSTER_BLOCK_SIZE, &npacked); Assert(npacked == nnewitems); newsegsize = SizeOfGinPostingList(newseg); diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 516465f8b7..acaca88863 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -1642,7 +1642,7 @@ freeGISTstate(GISTSTATE *giststate) static void gistprunepage(Relation rel, Page page, Buffer buffer, Relation 
heapRel) { - OffsetNumber deletable[MaxIndexTuplesPerPage]; + OffsetNumber deletable[MaxIndexTuplesPerPageLimit]; int ndeletable = 0; OffsetNumber offnum, maxoff; diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index 5e0c1447f9..31d4cdf917 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -255,7 +255,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) * Calculate target amount of free space to leave on pages. */ fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; - buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; + buildstate.freespace = CLUSTER_BLOCK_SIZE * (100 - fillfactor) / 100; /* * Build the index using the chosen strategy. @@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state) * Write an empty page as a placeholder for the root page. It will be * replaced with the real root page at the end. */ - page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); + page = palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, page, true); state->pages_allocated++; @@ -510,7 +510,7 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, if (levelstate->pages[levelstate->current_page] == NULL) levelstate->pages[levelstate->current_page] = - palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -580,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); + target = palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); gistinitpage(target, isleaf ? 
F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -631,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + parent->pages[0] = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); parent->parent = NULL; gistinitpage(parent->pages[0], 0); @@ -702,7 +702,7 @@ gistInitBuffering(GISTBuildState *buildstate) int levelStep; /* Calc space of index page which is available for index tuples */ - pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + pageFreeSpace = CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) - sizeof(ItemIdData) - buildstate->freespace; @@ -799,7 +799,7 @@ gistInitBuffering(GISTBuildState *buildstate) break; /* each node in the lowest level of a subtree has one page in memory */ - if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ) + if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / CLUSTER_BLOCK_SIZE) break; /* Good, we can handle this levelStep. See if we can go one higher. 
*/ @@ -858,7 +858,7 @@ calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep) Size pageFreeSpace; /* Calc space of index page which is available for index tuples */ - pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) + pageFreeSpace = CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) - sizeof(ItemIdData) - buildstate->freespace; diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 1423b4b047..5f2c18a40f 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -187,11 +187,11 @@ gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) GISTNodeBufferPage *pageBuffer; pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, - BLCKSZ); + CLUSTER_BLOCK_SIZE); pageBuffer->prev = InvalidBlockNumber; /* Set page free space */ - PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; + PAGE_FREE_SPACE(pageBuffer) = CLUSTER_BLOCK_SIZE - BUFFER_PAGE_DATA_OFFSET; return pageBuffer; } @@ -379,7 +379,7 @@ gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, * the new page by storing its block number in the prev-link. 
*/ PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = - BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); + CLUSTER_BLOCK_SIZE - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); nodeBuffer->pageBuffer->prev = blkno; /* We've just added one more page */ @@ -755,7 +755,7 @@ ReadTempFileBlock(BufFile *file, long blknum, void *ptr) { if (BufFileSeekBlock(file, blknum) != 0) elog(ERROR, "could not seek to block %ld in temporary file", blknum); - BufFileReadExact(file, ptr, BLCKSZ); + BufFileReadExact(file, ptr, CLUSTER_BLOCK_SIZE); } static void @@ -763,5 +763,5 @@ WriteTempFileBlock(BufFile *file, long blknum, const void *ptr) { if (BufFileSeekBlock(file, blknum) != 0) elog(ERROR, "could not seek to block %ld in temporary file", blknum); - BufFileWrite(file, ptr, BLCKSZ); + BufFileWrite(file, ptr, CLUSTER_BLOCK_SIZE); } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index f9f51152b8..7fa78eb002 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -758,7 +758,7 @@ gistinitpage(Page page, uint32 f) { GISTPageOpaque opaque; - PageInit(page, BLCKSZ, sizeof(GISTPageOpaqueData)); + PageInit(page, CLUSTER_BLOCK_SIZE, sizeof(GISTPageOpaqueData)); opaque = GistPageGetOpaque(page); opaque->rightlink = InvalidBlockNumber; diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 3f60d3274d..53a29e55b3 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -309,7 +309,7 @@ restart: } else if (GistPageIsLeaf(page)) { - OffsetNumber todelete[MaxOffsetNumber]; + OffsetNumber todelete[MaxOffsetNumberLimit]; int ntodelete = 0; int nremain; GISTPageOpaque opaque = GistPageGetOpaque(page); @@ -476,8 +476,8 @@ gistvacuum_delete_empty_pages(IndexVacuumInfo *info, GistVacState *vstate) Page page; OffsetNumber off, maxoff; - OffsetNumber todelete[MaxOffsetNumber]; - BlockNumber leafs_to_delete[MaxOffsetNumber]; + OffsetNumber 
todelete[MaxOffsetNumberLimit]; + BlockNumber leafs_to_delete[MaxOffsetNumberLimit]; int ntodelete; int deleted; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index fc5d97f606..f0a8ca0e4e 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -152,7 +152,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) * one page. Also, "initial index size" accounting does not include the * metapage, nor the first bitmap page. */ - sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ; + sort_threshold = (maintenance_work_mem * 1024L) / CLUSTER_BLOCK_SIZE; if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) sort_threshold = Min(sort_threshold, NBuffers); else @@ -709,7 +709,7 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, OffsetNumber maxoffno; Buffer next_buf; Page page; - OffsetNumber deletable[MaxOffsetNumber]; + OffsetNumber deletable[MaxOffsetNumberLimit]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 22656b24e2..48d986272a 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -371,7 +371,7 @@ _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, static void _hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf) { - OffsetNumber deletable[MaxOffsetNumber]; + OffsetNumber deletable[MaxOffsetNumberLimit]; int ndeletable = 0; OffsetNumber offnum, maxoff; diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 39bb2cb9f6..dd414c557e 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -866,10 +866,10 @@ _hash_squeezebucket(Relation rel, { OffsetNumber roffnum; OffsetNumber maxroffnum; - OffsetNumber deletable[MaxOffsetNumber]; - IndexTuple itups[MaxIndexTuplesPerPage]; - Size 
tups_size[MaxIndexTuplesPerPage]; - OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; + OffsetNumber deletable[MaxOffsetNumberLimit]; + IndexTuple itups[MaxIndexTuplesPerPageLimit]; + Size tups_size[MaxIndexTuplesPerPageLimit]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPageLimit]; uint16 ndeletable = 0; uint16 nitups = 0; Size all_tups_size = 0; diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index af3a154266..3a4380bde7 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -1012,7 +1012,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) * _hash_freeovflpage for similar usage. We take care to make the special * space valid for the benefit of tools such as pageinspect. */ - _hash_pageinit(page, BLCKSZ); + _hash_pageinit(page, CLUSTER_BLOCK_SIZE); ovflopaque = HashPageGetOpaque(page); @@ -1087,8 +1087,8 @@ _hash_splitbucket(Relation rel, Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; - OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; - IndexTuple itups[MaxIndexTuplesPerPage]; + OffsetNumber itup_offsets[MaxIndexTuplesPerPageLimit]; + IndexTuple itups[MaxIndexTuplesPerPageLimit]; Size all_tups_size = 0; int i; uint16 nitups = 0; diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 6fd1767f70..242740552f 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -233,7 +233,7 @@ large enough to accept any extra maintenance burden for. The currently planned heuristic is to prune and defrag when first accessing a page that potentially has prunable tuples (as flagged by the pd_prune_xid page hint field) and that either has free space less than MAX(fillfactor -target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to +target free space, CLUSTER_BLOCK_SIZE/10) *or* has recently had an UPDATE fail to find enough free space to store an updated tuple version. 
(These rules are subject to change.) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 7ed72abe59..59c020589c 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2061,7 +2061,7 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace) { - size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; + size_t page_avail = CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - saveFreeSpace; int npages = 1; for (int i = done; i < ntuples; i++) @@ -2071,7 +2071,7 @@ heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveF if (page_avail < tup_sz) { npages++; - page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace; + page_avail = CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - saveFreeSpace; } page_avail -= tup_sz; } @@ -2332,7 +2332,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, scratchptr += datalen; } totaldatalen = scratchptr - tupledata; - Assert((scratchptr - scratch.data) < BLCKSZ); + Assert((scratchptr - scratch.data) < CLUSTER_BLOCK_SIZE); if (need_tuple_data) xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; @@ -6740,8 +6740,8 @@ heap_freeze_execute_prepared(Relation rel, Buffer buffer, /* Now WAL-log freezing if necessary */ if (RelationNeedsWAL(rel)) { - xl_heap_freeze_plan plans[MaxHeapTuplesPerPage]; - OffsetNumber offsets[MaxHeapTuplesPerPage]; + xl_heap_freeze_plan plans[MaxHeapTuplesPerPageLimit]; + OffsetNumber offsets[MaxHeapTuplesPerPageLimit]; int nplans; xl_heap_freeze_page xlrec; XLogRecPtr recptr; @@ -7989,7 +7989,7 @@ index_delete_sort(TM_IndexDeleteOp *delstate) * Shellsort gap sequence (taken from Sedgewick-Incerpi paper). * * This implementation is fast with array sizes up to ~4500. This covers - * all supported BLCKSZ values. + * all supported CLUSTER_BLOCK_SIZE values. 
*/ const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1}; @@ -9019,7 +9019,7 @@ heap_xlog_visible(XLogReaderState *record) /* initialize the page if it was read as zeros */ if (PageIsNew(vmpage)) - PageInit(vmpage, BLCKSZ, 0); + PageInit(vmpage, CLUSTER_BLOCK_SIZE, 0); /* remove VISIBILITYMAP_XLOG_* */ vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS; @@ -9221,7 +9221,7 @@ heap_xlog_insert(XLogReaderState *record) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; HeapTupleHeader htup; xl_heap_header xlhdr; @@ -9324,7 +9324,7 @@ heap_xlog_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + if (action == BLK_NEEDS_REDO && freespace < CLUSTER_BLOCK_SIZE / 5) XLogRecordPageWithFreeSpace(target_locator, blkno, freespace); } @@ -9343,7 +9343,7 @@ heap_xlog_multi_insert(XLogReaderState *record) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; HeapTupleHeader htup; uint32 newlen; @@ -9471,7 +9471,7 @@ heap_xlog_multi_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + if (action == BLK_NEEDS_REDO && freespace < CLUSTER_BLOCK_SIZE / 5) XLogRecordPageWithFreeSpace(rlocator, blkno, freespace); } @@ -9500,7 +9500,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) union { HeapTupleHeaderData hdr; - char data[MaxHeapTupleSize]; + char data[MaxHeapTupleSizeLimit]; } tbuf; xl_heap_header xlhdr; uint32 newlen; @@ -9746,7 +9746,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. 
*/ - if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < CLUSTER_BLOCK_SIZE / 5) XLogRecordPageWithFreeSpace(rlocator, newblk, freespace); } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 0755be8390..89c213ac91 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1195,7 +1195,7 @@ heapam_index_build_range_scan(Relation heapRelation, TransactionId OldestXmin; BlockNumber previous_blkno = InvalidBlockNumber; BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; + OffsetNumber root_offsets[MaxHeapTuplesPerPageLimit]; /* * sanity checks @@ -1758,8 +1758,8 @@ heapam_index_validate_scan(Relation heapRelation, EState *estate; ExprContext *econtext; BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; - bool in_index[MaxHeapTuplesPerPage]; + OffsetNumber root_offsets[MaxHeapTuplesPerPageLimit]; + bool in_index[MaxHeapTuplesPerPageLimit]; BlockNumber previous_blkno = InvalidBlockNumber; /* state variables for the merge */ @@ -2096,7 +2096,7 @@ heapam_relation_toast_am(Relation rel) #define HEAP_OVERHEAD_BYTES_PER_TUPLE \ (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)) #define HEAP_USABLE_BYTES_PER_PAGE \ - (BLCKSZ - SizeOfPageHeaderData) + (CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData) static void heapam_estimate_rel_size(Relation rel, int32 *attr_widths, diff --git a/src/backend/access/heap/heaptoast.c b/src/backend/access/heap/heaptoast.c index 52ecd45654..8d38e375d9 100644 --- a/src/backend/access/heap/heaptoast.c +++ b/src/backend/access/heap/heaptoast.c @@ -770,7 +770,7 @@ heap_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, chcpyend = (sliceoffset + slicelength - 1) % TOAST_MAX_CHUNK_SIZE; memcpy(VARDATA(result) + - (curchunk * TOAST_MAX_CHUNK_SIZE - sliceoffset) + 
chcpystrt, + ((int32)(curchunk * TOAST_MAX_CHUNK_SIZE) - sliceoffset) + chcpystrt, chunkdata + chcpystrt, (chcpyend - chcpystrt) + 1); diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index c275b08494..d8fb0107ba 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -515,7 +515,7 @@ RelationGetBufferForTuple(Relation relation, Size len, ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %zu, maximum size %zu", - len, MaxHeapTupleSize))); + len, (Size)MaxHeapTupleSize))); /* Compute desired extra freespace due to fillfactor option */ saveFreeSpace = RelationGetTargetPageFreeSpace(relation, diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 47b9e20915..999ae8f063 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -54,9 +54,9 @@ typedef struct int ndead; int nunused; /* arrays that accumulate indexes of items to be changed */ - OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; - OffsetNumber nowdead[MaxHeapTuplesPerPage]; - OffsetNumber nowunused[MaxHeapTuplesPerPage]; + OffsetNumber redirected[MaxHeapTuplesPerPageLimit * 2]; + OffsetNumber nowdead[MaxHeapTuplesPerPageLimit]; + OffsetNumber nowunused[MaxHeapTuplesPerPageLimit]; /* * marked[i] is true if item i is entered in one of the above arrays. @@ -64,7 +64,7 @@ typedef struct * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is * 1. Otherwise every access would need to subtract 1. */ - bool marked[MaxHeapTuplesPerPage + 1]; + bool marked[MaxHeapTuplesPerPageLimit + 1]; /* * Tuple visibility is only computed once for each tuple, for correctness @@ -74,7 +74,7 @@ typedef struct * * Same indexing as ->marked. 
*/ - int8 htsv[MaxHeapTuplesPerPage + 1]; + int8 htsv[MaxHeapTuplesPerPageLimit + 1]; } PruneState; /* Local functions */ @@ -187,7 +187,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer) */ minfree = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); - minfree = Max(minfree, BLCKSZ / 10); + minfree = Max(minfree, CLUSTER_BLOCK_SIZE / 10); if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree) { @@ -598,7 +598,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) OffsetNumber latestdead = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; - OffsetNumber chainitems[MaxHeapTuplesPerPage]; + OffsetNumber chainitems[MaxHeapTuplesPerPageLimit]; int nchain = 0, i; diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 424958912c..23096b0149 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + state->rs_buffer = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; @@ -657,7 +657,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("row is too big: size %zu, maximum size %zu", - len, MaxHeapTupleSize))); + len, (Size)MaxHeapTupleSize))); /* Compute desired extra freespace due to fillfactor option */ saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel, @@ -702,7 +702,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) if (!state->rs_buffer_valid) { /* Initialize a new empty page */ - PageInit(page, BLCKSZ, 0); + PageInit(page, CLUSTER_BLOCK_SIZE, 0); 
state->rs_buffer_valid = true; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 4eb953f904..6037bfffd2 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -98,7 +98,7 @@ * (Note that this is deliberately kept to a power-of-two, usually 2^19.) */ #define FAILSAFE_EVERY_PAGES \ - ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / BLCKSZ)) + ((BlockNumber) (((uint64) 4 * 1024 * 1024 * 1024) / CLUSTER_BLOCK_SIZE)) /* * When a table has no indexes, vacuum the FSM after every 8GB, approximately @@ -107,7 +107,7 @@ * and we vacuum FSM after each index/heap cleaning pass. */ #define VACUUM_FSM_EVERY_PAGES \ - ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / BLCKSZ)) + ((BlockNumber) (((uint64) 8 * 1024 * 1024 * 1024) / CLUSTER_BLOCK_SIZE)) /* * Before we consider skipping a page that's marked as clean in @@ -749,9 +749,9 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, } if (secs_dur > 0 || usecs_dur > 0) { - read_rate = (double) BLCKSZ * PageMissOp / (1024 * 1024) / + read_rate = (double) CLUSTER_BLOCK_SIZE * PageMissOp / (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); - write_rate = (double) BLCKSZ * PageDirtyOp / (1024 * 1024) / + write_rate = (double) CLUSTER_BLOCK_SIZE * PageDirtyOp / (1024 * 1024) / (secs_dur + usecs_dur / 1000000.0); } appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), @@ -1439,7 +1439,7 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, if (GetRecordedFreeSpace(vacrel->rel, blkno) == 0) { - freespace = BLCKSZ - SizeOfPageHeaderData; + freespace = CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData; RecordPageWithFreeSpace(vacrel->rel, blkno, freespace); } @@ -1552,8 +1552,8 @@ lazy_scan_prune(LVRelState *vacrel, int nnewlpdead; HeapPageFreeze pagefrz; int64 fpi_before = pgWalUsage.wal_fpi; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; - HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; + 
OffsetNumber deadoffsets[MaxHeapTuplesPerPageLimit]; + HeapTupleFreeze frozen[MaxHeapTuplesPerPageLimit]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1972,7 +1972,7 @@ lazy_scan_noprune(LVRelState *vacrel, HeapTupleHeader tupleheader; TransactionId NoFreezePageRelfrozenXid = vacrel->NewRelfrozenXid; MultiXactId NoFreezePageRelminMxid = vacrel->NewRelminMxid; - OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; + OffsetNumber deadoffsets[MaxHeapTuplesPerPageLimit]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -2504,7 +2504,7 @@ lazy_vacuum_heap_page(LVRelState *vacrel, BlockNumber blkno, Buffer buffer, { VacDeadItems *dead_items = vacrel->dead_items; Page page = BufferGetPage(buffer); - OffsetNumber unused[MaxHeapTuplesPerPage]; + OffsetNumber unused[MaxHeapTuplesPerPageLimit]; int nunused = 0; TransactionId visibility_cutoff_xid; bool all_frozen; diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 7d54ec9c0f..24049e9291 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -105,7 +105,7 @@ * extra headers, so the whole page minus the standard page header is * used for the bitmap. 
*/ -#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) +#define MAPSIZE (CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData)) /* Number of heap blocks we can represent in one byte */ #define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) @@ -414,8 +414,8 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro */ map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); - StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, - "unsupported MAPSIZE"); + Assert(MAPSIZE % sizeof(uint64) == 0); + if (all_frozen == NULL) { for (i = 0; i < MAPSIZE / sizeof(uint64); i++) @@ -613,7 +613,7 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(BufferGetPage(buf))) - PageInit(BufferGetPage(buf), BLCKSZ, 0); + PageInit(BufferGetPage(buf), CLUSTER_BLOCK_SIZE, 0); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index d4db0b28f2..35f12b434d 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -324,7 +324,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; state->nmaxitems = 0; - state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */ + state->maxpostingsize = CLUSTER_BLOCK_SIZE; /* We're not really deduplicating */ state->base = NULL; state->baseoff = InvalidOffsetNumber; state->basetupsize = 0; @@ -353,7 +353,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, delstate.irel = rel; delstate.iblknum = BufferGetBlockNumber(buf); delstate.bottomup = true; - delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz); + delstate.bottomupfreespace = Max(CLUSTER_BLOCK_SIZE / 16, newitemsz); delstate.ndeltids = 0; delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); delstate.status = 
palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); @@ -417,7 +417,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, return true; /* Don't dedup when we won't end up back here any time soon anyway */ - return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz); + return PageGetExactFreeSpace(page) >= Max(CLUSTER_BLOCK_SIZE / 24, newitemsz); } /* @@ -597,7 +597,7 @@ _bt_dedup_finish_pending(Page newpage, BTDedupState state) spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData)); /* Increment nintervals, since we wrote a new posting list tuple */ state->nintervals++; - Assert(spacesaving > 0 && spacesaving < BLCKSZ); + Assert(spacesaving > 0 && spacesaving < CLUSTER_BLOCK_SIZE); } /* Reset state for next pending posting list */ diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index d33f814a93..eed23d35fd 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2685,7 +2685,7 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, bool simpleonly, bool checkingunique, bool uniquedup, bool indexUnchanged) { - OffsetNumber deletable[MaxIndexTuplesPerPage]; + OffsetNumber deletable[MaxIndexTuplesPerPageLimit]; int ndeletable = 0; OffsetNumber offnum, minoff, diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d78971bfe8..43d6fe6c50 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -70,7 +70,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, BTMetaPageData *metad; BTPageOpaque metaopaque; - _bt_pageinit(page, BLCKSZ); + _bt_pageinit(page, CLUSTER_BLOCK_SIZE); metad = BTPageGetMeta(page); metad->btm_magic = BTREE_MAGIC; @@ -977,7 +977,7 @@ _bt_allocbuf(Relation rel, Relation heaprel) */ buf = ExtendBufferedRel(EB_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST); if (!RelationUsesLocalBuffers(rel)) - VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), 
BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); /* Initialize the new page before returning it */ page = BufferGetPage(buf); @@ -1060,7 +1060,7 @@ _bt_lockbuf(Relation rel, Buffer buf, int access) * lock/pin held, though. */ if (!RelationUsesLocalBuffers(rel)) - VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); } /* @@ -1073,13 +1073,13 @@ _bt_unlockbuf(Relation rel, Buffer buf) * Buffer is pinned and locked, which means that it is expected to be * defined and addressable. Check that proactively. */ - VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); /* LockBuffer() asserts that pin is held by this backend */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); if (!RelationUsesLocalBuffers(rel)) - VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ); + VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); } /* @@ -1097,7 +1097,7 @@ _bt_conditionallockbuf(Relation rel, Buffer buf) return false; if (!RelationUsesLocalBuffers(rel)) - VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); return true; } @@ -1112,7 +1112,7 @@ _bt_upgradelockbufcleanup(Relation rel, Buffer buf) * Buffer is pinned and locked, which means that it is expected to be * defined and addressable. Check that proactively. 
*/ - VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ); + VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), CLUSTER_BLOCK_SIZE); /* LockBuffer() asserts that pin is held by this backend */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); @@ -1160,7 +1160,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, bool needswal = RelationNeedsWAL(rel); char *updatedbuf = NULL; Size updatedbuflen = 0; - OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPageLimit]; /* Shouldn't be called unless there's something to do */ Assert(ndeletable > 0 || nupdatable > 0); @@ -1291,7 +1291,7 @@ _bt_delitems_delete(Relation rel, Buffer buf, bool needswal = RelationNeedsWAL(rel); char *updatedbuf = NULL; Size updatedbuflen = 0; - OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; + OffsetNumber updatedoffsets[MaxIndexTuplesPerPageLimit]; /* Shouldn't be called unless there's something to do */ Assert(ndeletable > 0 || nupdatable > 0); @@ -1524,8 +1524,8 @@ _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, OffsetNumber postingidxoffnum = InvalidOffsetNumber; int ndeletable = 0, nupdatable = 0; - OffsetNumber deletable[MaxIndexTuplesPerPage]; - BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + OffsetNumber deletable[MaxIndexTuplesPerPageLimit]; + BTVacuumPosting updatable[MaxIndexTuplesPerPageLimit]; /* Use tableam interface to determine which tuples to delete first */ snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 4553aaee53..259663bca0 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -154,7 +154,7 @@ btbuildempty(Relation index) Page metapage; /* Construct metapage. 
*/ - metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + metapage = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); /* @@ -425,8 +425,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, */ if (scan->xs_want_itup && so->currTuples == NULL) { - so->currTuples = (char *) palloc(BLCKSZ * 2); - so->markTuples = so->currTuples + BLCKSZ; + so->currTuples = (char *) palloc(CLUSTER_BLOCK_SIZE * 2); + so->markTuples = so->currTuples + CLUSTER_BLOCK_SIZE; } /* @@ -1154,9 +1154,9 @@ backtrack: } else if (P_ISLEAF(opaque)) { - OffsetNumber deletable[MaxIndexTuplesPerPage]; + OffsetNumber deletable[MaxIndexTuplesPerPageLimit]; int ndeletable; - BTVacuumPosting updatable[MaxIndexTuplesPerPage]; + BTVacuumPosting updatable[MaxIndexTuplesPerPageLimit]; int nupdatable; OffsetNumber offnum, minoff, diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index c2665fce41..f584adc8c9 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -619,10 +619,10 @@ _bt_blnewpage(uint32 level) Page page; BTPageOpaque opaque; - page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + page = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); /* Zero the page and set up standard page header info */ - _bt_pageinit(page, BLCKSZ); + _bt_pageinit(page, CLUSTER_BLOCK_SIZE); /* Initialize BT opaque state */ opaque = BTPageGetOpaque(page); @@ -660,7 +660,7 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) while (blkno > wstate->btws_pages_written) { if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, + wstate->btws_zeropage = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); /* don't set checksum for all-zero page */ @@ -715,7 +715,7 @@ _bt_pagestate(BTWriteState *wstate, uint32 level) state->btps_level = level; /* set "full" 
threshold based on level. See notes at head of file. */ if (level > 0) - state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); + state->btps_full = (CLUSTER_BLOCK_SIZE * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); else state->btps_full = BTGetTargetPageFreeSpace(wstate->index); @@ -1172,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. */ - metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + metapage = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, rootblkno, rootlevel, wstate->inskey->allequalimage); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); @@ -1350,7 +1350,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) * leaf pages full with few very large tuples doesn't seem * like a useful goal.) */ - dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - + dstate->maxpostingsize = MAXALIGN_DOWN((CLUSTER_BLOCK_SIZE * 10 / 100)) - sizeof(ItemIdData); Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && dstate->maxpostingsize <= INDEX_SIZE_MASK); diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index c87e46ed66..7035665eaf 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -40,8 +40,8 @@ _bt_restore_page(Page page, char *from, int len) IndexTupleData itupdata; Size itemsz; char *end = from + len; - Item items[MaxIndexTuplesPerPage]; - uint16 itemsizes[MaxIndexTuplesPerPage]; + Item items[MaxIndexTuplesPerPageLimit]; + uint16 itemsizes[MaxIndexTuplesPerPageLimit]; int i; int nitems; diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index f390c177e4..de760439b0 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -271,7 +271,7 @@ 
XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, "" : " for WAL verification", XLogRecGetBlock(record, block_id)->hole_offset, XLogRecGetBlock(record, block_id)->hole_length, - BLCKSZ - + CLUSTER_BLOCK_SIZE - XLogRecGetBlock(record, block_id)->hole_length - XLogRecGetBlock(record, block_id)->bimg_len, method); diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index 3554edcc9a..12ec7cf9ee 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -135,7 +135,7 @@ spgPageIndexMultiDelete(SpGistState *state, Page page, BlockNumber blkno, OffsetNumber offnum) { OffsetNumber firstItem; - OffsetNumber sortednos[MaxIndexTuplesPerPage]; + OffsetNumber sortednos[MaxIndexTuplesPerPageLimit]; SpGistDeadTuple tuple = NULL; int i; @@ -341,8 +341,8 @@ checkSplitConditions(Relation index, SpGistState *state, if (SpGistBlockIsRoot(current->blkno)) { /* return impossible values to force split */ - *nToSplit = BLCKSZ; - return BLCKSZ; + *nToSplit = CLUSTER_BLOCK_SIZE; + return CLUSTER_BLOCK_SIZE; } i = current->offnum; diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 72d2e1551c..1e1025128d 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -158,7 +158,7 @@ spgbuildempty(Relation index) Page page; /* Construct metapage. 
*/ - page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); + page = (Page) palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); SpGistInitMetapage(page); /* diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index cbfaf0c00a..8686916846 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -755,8 +755,8 @@ spgGetNextQueueItem(SpGistScanOpaque so) enum SpGistSpecialOffsetNumbers { SpGistBreakOffsetNumber = InvalidOffsetNumber, - SpGistRedirectOffsetNumber = MaxOffsetNumber + 1, - SpGistErrorOffsetNumber = MaxOffsetNumber + 2 + SpGistRedirectOffsetNumber = MaxOffsetNumberLimit + 1, + SpGistErrorOffsetNumber = MaxOffsetNumberLimit + 2 }; static OffsetNumber diff --git a/src/backend/access/spgist/spgtextproc.c b/src/backend/access/spgist/spgtextproc.c index 03a7afdbab..a741dd1614 100644 --- a/src/backend/access/spgist/spgtextproc.c +++ b/src/backend/access/spgist/spgtextproc.c @@ -53,20 +53,20 @@ * In the worst case, an inner tuple in a text radix tree could have as many * as 258 nodes (one for each possible byte value, plus the two special * cases). Each node can take 16 bytes on MAXALIGN=8 machines. The inner - * tuple must fit on an index page of size BLCKSZ. Rather than assuming we + * tuple must fit on an index page of size CLUSTER_BLOCK_SIZE. Rather than assuming we * know the exact amount of overhead imposed by page headers, tuple headers, * etc, we leave 100 bytes for that (the actual overhead should be no more * than 56 bytes at this writing, so there is slop in this number). - * So we can safely create prefixes up to BLCKSZ - 258 * 16 - 100 bytes long. + * So we can safely create prefixes up to CLUSTER_BLOCK_SIZE - 258 * 16 - 100 bytes long. 
* Unfortunately, because 258 * 16 is over 4K, there is no safe prefix length - * when BLCKSZ is less than 8K; it is always possible to get "SPGiST inner + * when CLUSTER_BLOCK_SIZE is less than 8K; it is always possible to get "SPGiST inner * tuple size exceeds maximum" if there are too many distinct next-byte values * at a given place in the tree. Since use of nonstandard block sizes appears * to be negligible in the field, we just live with that fact for now, - * choosing a max prefix size of 32 bytes when BLCKSZ is configured smaller + * choosing a max prefix size of 32 bytes when CLUSTER_BLOCK_SIZE is configured smaller * than default. */ -#define SPGIST_MAX_PREFIX_LENGTH Max((int) (BLCKSZ - 258 * 16 - 100), 32) +#define SPGIST_MAX_PREFIX_LENGTH Max((int) (CLUSTER_BLOCK_SIZE - 258 * 16 - 100), 32) /* * Strategy for collation aware operator on text is equal to btree strategy diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 190e4f76a9..768b7128c5 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -681,7 +681,7 @@ SpGistInitPage(Page page, uint16 f) { SpGistPageOpaque opaque; - PageInit(page, BLCKSZ, sizeof(SpGistPageOpaqueData)); + PageInit(page, CLUSTER_BLOCK_SIZE, sizeof(SpGistPageOpaqueData)); opaque = SpGistPageGetOpaque(page); opaque->flags = f; opaque->spgist_page_id = SPGIST_PAGE_ID; @@ -693,7 +693,7 @@ SpGistInitPage(Page page, uint16 f) void SpGistInitBuffer(Buffer b, uint16 f) { - Assert(BufferGetPageSize(b) == BLCKSZ); + Assert(BufferGetPageSize(b) == CLUSTER_BLOCK_SIZE); SpGistInitPage(BufferGetPage(b), f); } diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 8a5b540c80..f561ff6e2b 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -128,14 +128,14 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, { Page page = BufferGetPage(buffer); 
spgxlogVacuumLeaf xlrec; - OffsetNumber toDead[MaxIndexTuplesPerPage]; - OffsetNumber toPlaceholder[MaxIndexTuplesPerPage]; - OffsetNumber moveSrc[MaxIndexTuplesPerPage]; - OffsetNumber moveDest[MaxIndexTuplesPerPage]; - OffsetNumber chainSrc[MaxIndexTuplesPerPage]; - OffsetNumber chainDest[MaxIndexTuplesPerPage]; - OffsetNumber predecessor[MaxIndexTuplesPerPage + 1]; - bool deletable[MaxIndexTuplesPerPage + 1]; + OffsetNumber toDead[MaxIndexTuplesPerPageLimit]; + OffsetNumber toPlaceholder[MaxIndexTuplesPerPageLimit]; + OffsetNumber moveSrc[MaxIndexTuplesPerPageLimit]; + OffsetNumber moveDest[MaxIndexTuplesPerPageLimit]; + OffsetNumber chainSrc[MaxIndexTuplesPerPageLimit]; + OffsetNumber chainDest[MaxIndexTuplesPerPageLimit]; + OffsetNumber predecessor[MaxIndexTuplesPerPageLimit + 1]; + bool deletable[MaxIndexTuplesPerPageLimit + 1]; int nDeletable; OffsetNumber i, max = PageGetMaxOffsetNumber(page); @@ -408,7 +408,7 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) { Page page = BufferGetPage(buffer); spgxlogVacuumRoot xlrec; - OffsetNumber toDelete[MaxIndexTuplesPerPage]; + OffsetNumber toDelete[MaxIndexTuplesPerPageLimit]; OffsetNumber i, max = PageGetMaxOffsetNumber(page); @@ -498,8 +498,8 @@ vacuumRedirectAndPlaceholder(Relation index, Relation heaprel, Buffer buffer) firstPlaceholder = InvalidOffsetNumber; bool hasNonPlaceholder = false; bool hasUpdate = false; - OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; - OffsetNumber itemnos[MaxIndexTuplesPerPage]; + OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPageLimit]; + OffsetNumber itemnos[MaxIndexTuplesPerPageLimit]; spgxlogVacuumRedirect xlrec; GlobalVisState *vistest; diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 771438c8ce..d962334251 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -636,7 +636,7 @@ table_block_relation_size(Relation rel, ForkNumber forkNumber) else nblocks = 
smgrnblocks(RelationGetSmgr(rel), forkNumber); - return nblocks * BLCKSZ; + return nblocks * CLUSTER_BLOCK_SIZE; } /* diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 22c8ae9755..51558efd2c 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -851,7 +851,7 @@ we won't be able to hint its outputs until the second xact is sync'd, up to three walwriter cycles later. This argues for keeping N (the group size) as small as possible. For the moment we are setting the group size to 32, which makes the LSN cache space the same size as the actual clog buffer -space (independently of BLCKSZ). +space (independently of CLUSTER_BLOCK_SIZE). It is useful that we can run both synchronous and asynchronous commit transactions concurrently, but the safety of this is perhaps not diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 4a431d5876..ffcc6e4056 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -45,7 +45,7 @@ #include "storage/sync.h" /* - * Defines for CLOG page sizes. A page is the same BLCKSZ as is used + * Defines for CLOG page sizes. A page is the same CLUSTER_BLOCK_SIZE as is used * everywhere else in Postgres. 
* * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, @@ -59,7 +59,7 @@ /* We need two bits per xact, so four xacts fit in a byte */ #define CLOG_BITS_PER_XACT 2 #define CLOG_XACTS_PER_BYTE 4 -#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACTS_PER_PAGE (CLUSTER_BLOCK_SIZE * CLOG_XACTS_PER_BYTE) #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) #define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) @@ -802,7 +802,7 @@ TrimCLOG(void) /* Zero so-far-unused positions in the current byte */ *byteptr &= (1 << bshift) - 1; /* Zero the rest of the page */ - MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + MemSet(byteptr + 1, 0, CLUSTER_BLOCK_SIZE - byteno - 1); XactCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index b897fabc70..8d1c26951d 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -37,7 +37,7 @@ #include "utils/timestamp.h" /* - * Defines for CommitTs page sizes. A page is the same BLCKSZ as is used + * Defines for CommitTs page sizes. A page is the same CLUSTER_BLOCK_SIZE as is used * everywhere else in Postgres. * * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, @@ -63,7 +63,7 @@ typedef struct CommitTimestampEntry sizeof(RepOriginId)) #define COMMIT_TS_XACTS_PER_PAGE \ - (BLCKSZ / SizeOfCommitTimestampEntry) + (CLUSTER_BLOCK_SIZE / SizeOfCommitTimestampEntry) #define TransactionIdToCTsPage(xid) \ ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) @@ -898,7 +898,7 @@ AdvanceOldestCommitTsXid(TransactionId oldestXact) * Decide whether a commitTS page number is "older" for truncation purposes. * Analogous to CLOGPagePrecedes(). * - * At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This + * At default CLUSTER_BLOCK_SIZE, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. 
This * introduces differences compared to CLOG and the other SLRUs having (1 << * 31) % per_page == 0. This function never tests exactly * TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit, diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 6c68191ca6..9050e449fe 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -45,7 +45,7 @@ */ #define FRAGMENT_HEADER_SIZE (2 * sizeof(OffsetNumber)) #define MATCH_THRESHOLD FRAGMENT_HEADER_SIZE -#define MAX_DELTA_SIZE (BLCKSZ + 2 * FRAGMENT_HEADER_SIZE) +#define MAX_DELTA_SIZE (MAX_BLOCK_SIZE + 2 * FRAGMENT_HEADER_SIZE) /* Struct of generic xlog data for single page */ typedef struct @@ -241,8 +241,8 @@ computeDelta(PageData *pageData, Page curpage, Page targetpage) 0, curLower); /* ... and for upper part, ignoring what's between */ computeRegionDelta(pageData, curpage, targetpage, - targetUpper, BLCKSZ, - curUpper, BLCKSZ); + targetUpper, CLUSTER_BLOCK_SIZE, + curUpper, CLUSTER_BLOCK_SIZE); /* * If xlog debug is enabled, then check produced delta. 
Result of delta @@ -253,11 +253,11 @@ computeDelta(PageData *pageData, Page curpage, Page targetpage) { PGAlignedBlock tmp; - memcpy(tmp.data, curpage, BLCKSZ); + memcpy(tmp.data, curpage, CLUSTER_BLOCK_SIZE); applyPageRedo(tmp.data, pageData->delta, pageData->deltaLen); if (memcmp(tmp.data, targetpage, targetLower) != 0 || memcmp(tmp.data + targetUpper, targetpage + targetUpper, - BLCKSZ - targetUpper) != 0) + CLUSTER_BLOCK_SIZE - targetUpper) != 0) elog(ERROR, "result of generic xlog apply does not match"); } #endif @@ -311,7 +311,7 @@ GenericXLogRegisterBuffer(GenericXLogState *state, Buffer buffer, int flags) /* Empty slot, so use it (there cannot be a match later) */ page->buffer = buffer; page->flags = flags; - memcpy(page->image, BufferGetPage(buffer), BLCKSZ); + memcpy(page->image, BufferGetPage(buffer), CLUSTER_BLOCK_SIZE); return (Page) page->image; } else if (page->buffer == buffer) @@ -373,7 +373,7 @@ GenericXLogFinish(GenericXLogState *state) pageHeader->pd_upper - pageHeader->pd_lower); memcpy(page + pageHeader->pd_upper, pageData->image + pageHeader->pd_upper, - BLCKSZ - pageHeader->pd_upper); + CLUSTER_BLOCK_SIZE - pageHeader->pd_upper); XLogRegisterBuffer(i, pageData->buffer, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); @@ -392,7 +392,7 @@ GenericXLogFinish(GenericXLogState *state) pageHeader->pd_upper - pageHeader->pd_lower); memcpy(page + pageHeader->pd_upper, pageData->image + pageHeader->pd_upper, - BLCKSZ - pageHeader->pd_upper); + CLUSTER_BLOCK_SIZE - pageHeader->pd_upper); XLogRegisterBuffer(i, pageData->buffer, REGBUF_STANDARD); XLogRegisterBufData(i, pageData->delta, pageData->deltaLen); @@ -426,7 +426,7 @@ GenericXLogFinish(GenericXLogState *state) continue; memcpy(BufferGetPage(pageData->buffer), pageData->image, - BLCKSZ); + CLUSTER_BLOCK_SIZE); /* We don't worry about zeroing the "hole" in this case */ MarkBufferDirty(pageData->buffer); } diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 
abb022e067..0a3e7e882b 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -93,7 +93,7 @@ /* - * Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is + * Defines for MultiXactOffset page sizes. A page is the same CLUSTER_BLOCK_SIZE as is * used everywhere else in Postgres. * * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, @@ -106,7 +106,7 @@ */ /* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) +#define MULTIXACT_OFFSETS_PER_PAGE (CLUSTER_BLOCK_SIZE / sizeof(MultiXactOffset)) #define MultiXactIdToOffsetPage(xid) \ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) @@ -119,7 +119,7 @@ * additional flag bits for each TransactionId. To do this without getting * into alignment issues, we store four bytes of flags, and then the * corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and - * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups + * are stored as a whole in pages. Thus, with 8kB CLUSTER_BLOCK_SIZE, we keep 409 groups * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and * performance) trumps space efficiency here. 
* @@ -138,7 +138,7 @@ /* size in bytes of a complete group */ #define MULTIXACT_MEMBERGROUP_SIZE \ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE (CLUSTER_BLOCK_SIZE / MULTIXACT_MEMBERGROUP_SIZE) #define MULTIXACT_MEMBERS_PER_PAGE \ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) @@ -2072,7 +2072,7 @@ TrimMultiXact(void) offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + MemSet(offptr, 0, CLUSTER_BLOCK_SIZE - (entryno * sizeof(MultiXactOffset))); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } @@ -2104,7 +2104,7 @@ TrimMultiXact(void) xidptr = (TransactionId *) (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); - MemSet(xidptr, 0, BLCKSZ - memberoff); + MemSet(xidptr, 0, CLUSTER_BLOCK_SIZE - memberoff); /* * Note: we don't need to zero out the flag bits in the remaining diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 71ac70fb40..2030fd36e7 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -169,7 +169,7 @@ SimpleLruShmemSize(int nslots, int nlsns) if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ - return BUFFERALIGN(sz) + BLCKSZ * nslots; + return BUFFERALIGN(sz) + CLUSTER_BLOCK_SIZE * nslots; } /* @@ -251,7 +251,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, shared->page_status[slotno] = SLRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; shared->page_lru_count[slotno] = 0; - ptr += BLCKSZ; + ptr += CLUSTER_BLOCK_SIZE; } /* Should fit to estimated shmem size */ @@ -297,7 +297,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) SlruRecentlyUsed(shared, slotno); /* Set the buffer to zeroes */ - 
MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + MemSet(shared->page_buffer[slotno], 0, CLUSTER_BLOCK_SIZE); /* Set the LSNs for this new page to zero */ SimpleLruZeroLSNs(ctl, slotno); @@ -628,7 +628,7 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) { int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; + int offset = rpageno * CLUSTER_BLOCK_SIZE; char path[MAXPGPATH]; int fd; bool result; @@ -659,7 +659,7 @@ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) SlruReportIOError(ctl, pageno, 0); } - result = endpos >= (off_t) (offset + BLCKSZ); + result = endpos >= (off_t) (offset + CLUSTER_BLOCK_SIZE); if (CloseTransientFile(fd) != 0) { @@ -687,7 +687,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - off_t offset = rpageno * BLCKSZ; + off_t offset = rpageno * CLUSTER_BLOCK_SIZE; char path[MAXPGPATH]; int fd; @@ -713,13 +713,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + MemSet(shared->page_buffer[slotno], 0, CLUSTER_BLOCK_SIZE); return true; } errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); - if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + if (pg_pread(fd, shared->page_buffer[slotno], CLUSTER_BLOCK_SIZE, offset) != CLUSTER_BLOCK_SIZE) { pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; @@ -759,7 +759,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - off_t offset = rpageno * BLCKSZ; + off_t offset = rpageno * CLUSTER_BLOCK_SIZE; char path[MAXPGPATH]; int fd = -1; @@ -874,7 +874,7 @@ 
SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) + if (pg_pwrite(fd, shared->page_buffer[slotno], CLUSTER_BLOCK_SIZE, offset) != CLUSTER_BLOCK_SIZE) { pgstat_report_wait_end(); /* if write didn't set errno, assume problem is no disk space */ @@ -933,7 +933,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) { int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; + int offset = rpageno * CLUSTER_BLOCK_SIZE; char path[MAXPGPATH]; SlruFileName(ctl, path, segno); diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 62bb610167..a066be2672 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -36,7 +36,7 @@ /* - * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used + * Defines for SubTrans page sizes. A page is the same CLUSTER_BLOCK_SIZE as is used * everywhere else in Postgres. 
* * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, @@ -49,7 +49,7 @@ */ /* We need four bytes per xact */ -#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) +#define SUBTRANS_XACTS_PER_PAGE (CLUSTER_BLOCK_SIZE / sizeof(TransactionId)) #define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) #define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c index 94e152694e..24319d1aeb 100644 --- a/src/backend/access/transam/timeline.c +++ b/src/backend/access/transam/timeline.c @@ -307,7 +307,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, char path[MAXPGPATH]; char tmppath[MAXPGPATH]; char histfname[MAXFNAMELEN]; - char buffer[BLCKSZ]; + char buffer[DEFAULT_BLOCK_SIZE]; int srcfd; int fd; int nbytes; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 334adac09e..b95d23a4c3 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -370,7 +370,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) * being significant compared to total XID space. (VACUUM requires an XID * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA * might do by reflex, assigns an XID. Hence, we had better be sure - * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * there's lots of XIDs left...) Also, at default CLUSTER_BLOCK_SIZE, this leaves two * completely-idle segments. In the event of edge-case bugs involving * page or segment arithmetic, idle segments render the bugs unreachable * outside of single-user mode. 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8b0710abe6..de6b0c193a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -67,6 +67,7 @@ #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" +#include "common/blocksize.h" #include "common/controldata_utils.h" #include "common/file_utils.h" #include "executor/instrument.h" @@ -3900,8 +3901,8 @@ WriteControlFile(void) ControlFile->maxAlign = MAXIMUM_ALIGNOF; ControlFile->floatFormat = FLOATFORMAT_VALUE; - ControlFile->blcksz = BLCKSZ; - ControlFile->relseg_size = RELSEG_SIZE; + ControlFile->blcksz = CLUSTER_BLOCK_SIZE; + ControlFile->relseg_size = CLUSTER_RELSEG_SIZE; ControlFile->xlog_blcksz = XLOG_BLCKSZ; ControlFile->xlog_seg_size = wal_segment_size; @@ -3967,14 +3968,104 @@ WriteControlFile(void) XLOG_CONTROL_FILE))); } +extern int block_size; + +/* + * This routine reads and returns the block size from the control file as + * needed; since this is the only field we care about and it needs to be + * handled early in the process, we don't worry about locks and the like; this + * is static for the life of the cluster. + */ +uint32 +ClusterBlockSize(void) +{ + pg_crc32c crc; + int fd, r; + ControlFileData cluster_control; + + if (block_size) + return block_size; + + if (ControlFile) + return ControlFile->blcksz; + + /* TODO: check for bootstrap mode, since that's passed in the param? */
+ + /* neither shortcut worked, so go ahead and open the control file and + * parse out only the basic field we need */ + fd = BasicOpenFile(XLOG_CONTROL_FILE, + O_RDONLY | PG_BINARY); + if (fd < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + XLOG_CONTROL_FILE))); + + r = read(fd, &cluster_control, sizeof(ControlFileData)); + if (r != sizeof(ControlFileData)) + { + if (r < 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + XLOG_CONTROL_FILE))); + else + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read file \"%s\": read %d of %zu", + XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); + } + close(fd); + + /* + * Check for expected pg_control format version. If this is wrong, the + * CRC check will likely fail because we'll be checking the wrong number + * of bytes. Complaining about wrong version will probably be more + * enlightening than complaining about wrong CRC. + */ + + if (cluster_control.pg_control_version != PG_CONTROL_VERSION && cluster_control.pg_control_version % 65536 == 0 && cluster_control.pg_control_version / 65536 != 0) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x)," + " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).", + cluster_control.pg_control_version, cluster_control.pg_control_version, + PG_CONTROL_VERSION, PG_CONTROL_VERSION), + errhint("This could be a problem of mismatched byte ordering. 
It looks like you need to initdb."))); + + if (cluster_control.pg_control_version != PG_CONTROL_VERSION) + ereport(FATAL, + (errmsg("database files are incompatible with server"), + errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d," + " but the server was compiled with PG_CONTROL_VERSION %d.", + cluster_control.pg_control_version, PG_CONTROL_VERSION), + errhint("It looks like you need to initdb."))); + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, + (char *) &cluster_control, + offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, cluster_control.crc)) + ereport(FATAL, + (errmsg("incorrect checksum in control file"))); + + block_size = cluster_control.blcksz; + + return block_size; +} + static void ReadControlFile(void) { pg_crc32c crc; int fd; static char wal_segsz_str[20]; + static char block_size_str[20]; int r; - + int block_size; /* * Read data... */ @@ -4040,6 +4131,26 @@ ReadControlFile(void) ereport(FATAL, (errmsg("incorrect checksum in control file"))); + /* + * Block size computations affect a number of things that are later + * checked, so ensure that we calculate as soon as CRC has been validated + * before checking other things that may depend on it. + */ + + block_size = ControlFile->blcksz; + + if (!IsValidBlockSize(block_size)) + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_plural("Block size must be a power of two between 1k and 32k, but the control file specifies %d byte", + "Block size must be a power of two between 1k and 32k, but the control file specifies %d bytes", + block_size, + block_size))); + + BlockSizeInit(block_size); + snprintf(block_size_str, sizeof(block_size_str), "%d", block_size); + SetConfigOption("block_size", block_size_str, PGC_INTERNAL, + PGC_S_DYNAMIC_DEFAULT); + /* * Do compatibility checking immediately. 
If the database isn't * compatible with the backend executable, we want to abort before we can @@ -4064,19 +4175,12 @@ ReadControlFile(void) (errmsg("database files are incompatible with server"), errdetail("The database cluster appears to use a different floating-point number format than the server executable."), errhint("It looks like you need to initdb."))); - if (ControlFile->blcksz != BLCKSZ) - ereport(FATAL, - (errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized with BLCKSZ %d," - " but the server was compiled with BLCKSZ %d.", - ControlFile->blcksz, BLCKSZ), - errhint("It looks like you need to recompile or initdb."))); - if (ControlFile->relseg_size != RELSEG_SIZE) + if (ControlFile->relseg_size != CLUSTER_RELSEG_SIZE) ereport(FATAL, (errmsg("database files are incompatible with server"), - errdetail("The database cluster was initialized with RELSEG_SIZE %d," - " but the server was compiled with RELSEG_SIZE %d.", - ControlFile->relseg_size, RELSEG_SIZE), + errdetail("The database cluster was initialized with CLUSTER_RELSEG_SIZE %d," + " but the server was compiled with CLUSTER_RELSEG_SIZE %d.", + ControlFile->relseg_size, CLUSTER_RELSEG_SIZE), errhint("It looks like you need to recompile or initdb."))); if (ControlFile->xlog_blcksz != XLOG_BLCKSZ) ereport(FATAL, diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 54247e1d81..8210eec724 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -46,18 +46,18 @@ * backup block image. 
*/ #ifdef USE_LZ4 -#define LZ4_MAX_BLCKSZ LZ4_COMPRESSBOUND(BLCKSZ) +#define LZ4_MAX_BLCKSZ LZ4_COMPRESSBOUND(MAX_BLOCK_SIZE) #else #define LZ4_MAX_BLCKSZ 0 #endif #ifdef USE_ZSTD -#define ZSTD_MAX_BLCKSZ ZSTD_COMPRESSBOUND(BLCKSZ) +#define ZSTD_MAX_BLCKSZ ZSTD_COMPRESSBOUND(MAX_BLOCK_SIZE) #else #define ZSTD_MAX_BLCKSZ 0 #endif -#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(MAX_BLOCK_SIZE) /* Buffer size required to store a compressed version of backup block image */ #define COMPRESS_BUFSIZE Max(Max(PGLZ_MAX_BLCKSZ, LZ4_MAX_BLCKSZ), ZSTD_MAX_BLCKSZ) @@ -383,7 +383,7 @@ XLogRegisterData(char *data, uint32 len) * block_id, the data is appended. * * The maximum amount of data that can be registered per block is 65535 - * bytes. That should be plenty; if you need more than BLCKSZ bytes to + * bytes. That should be plenty; if you need more than CLUSTER_BLOCK_SIZE bytes to * reconstruct the changes to the page, you might as well just log a full * copy of it. 
(the "main data" that's not associated with a block is not * limited) @@ -650,7 +650,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (lower >= SizeOfPageHeaderData && upper > lower && - upper <= BLCKSZ) + upper <= CLUSTER_BLOCK_SIZE) { bimg.hole_offset = lower; cbimg.hole_length = upper - lower; @@ -746,12 +746,12 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } else { - bimg.length = BLCKSZ - cbimg.hole_length; + bimg.length = CLUSTER_BLOCK_SIZE - cbimg.hole_length; if (cbimg.hole_length == 0) { rdt_datas_last->data = page; - rdt_datas_last->len = BLCKSZ; + rdt_datas_last->len = CLUSTER_BLOCK_SIZE; } else { @@ -765,7 +765,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last->data = page + (bimg.hole_offset + cbimg.hole_length); rdt_datas_last->len = - BLCKSZ - (bimg.hole_offset + cbimg.hole_length); + CLUSTER_BLOCK_SIZE - (bimg.hole_offset + cbimg.hole_length); } } @@ -932,7 +932,7 @@ static bool XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, char *dest, uint16 *dlen) { - int32 orig_len = BLCKSZ - hole_length; + int32 orig_len = CLUSTER_BLOCK_SIZE - hole_length; int32 len = -1; int32 extra_bytes = 0; char *source; @@ -945,7 +945,7 @@ XLogCompressBackupBlock(char *page, uint16 hole_offset, uint16 hole_length, memcpy(source, page, hole_offset); memcpy(source + hole_offset, page + (hole_offset + hole_length), - BLCKSZ - (hole_length + hole_offset)); + CLUSTER_BLOCK_SIZE - (hole_length + hole_offset)); /* * Extra data needs to be stored in WAL record for the compressed @@ -1096,10 +1096,10 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std) uint16 upper = ((PageHeader) page)->pd_upper; memcpy(copied_buffer.data, origdata, lower); - memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper); + memcpy(copied_buffer.data + upper, origdata + upper, CLUSTER_BLOCK_SIZE - upper); } else - memcpy(copied_buffer.data, origdata, BLCKSZ); + memcpy(copied_buffer.data, origdata, CLUSTER_BLOCK_SIZE); XLogBeginInsert(); diff 
--git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 539928cb85..a70baaab12 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -51,7 +51,7 @@ * Every time we process this much WAL, we'll update the values in * pg_stat_recovery_prefetch. */ -#define XLOGPREFETCHER_STATS_DISTANCE BLCKSZ +#define XLOGPREFETCHER_STATS_DISTANCE CLUSTER_BLOCK_SIZE /* * To detect repeated access to the same block and skip useless extra system diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 2e7b1ba8e1..70170c2029 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -189,7 +189,7 @@ XLogReaderFree(XLogReaderState *state) * readRecordBufSize is set to the new buffer size. * * To avoid useless small increases, round its size to a multiple of - * XLOG_BLCKSZ, and make sure it's at least 5*Max(BLCKSZ, XLOG_BLCKSZ) to start + * XLOG_BLCKSZ, and make sure it's at least 5*Max(CLUSTER_BLOCK_SIZE, XLOG_BLCKSZ) to start * with. (That is enough for all "normal" records, but very large commit or * abort records might need more space.) */ @@ -199,7 +199,7 @@ allocate_recordbuf(XLogReaderState *state, uint32 reclength) uint32 newSize = reclength; newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ); - newSize = Max(newSize, 5 * Max(BLCKSZ, XLOG_BLCKSZ)); + newSize = Max(newSize, 5 * Max(CLUSTER_BLOCK_SIZE, XLOG_BLCKSZ)); #ifndef FRONTEND @@ -1790,17 +1790,17 @@ DecodeXLogRecord(XLogReaderState *state, blk->hole_length = 0; } else - blk->hole_length = BLCKSZ - blk->bimg_len; + blk->hole_length = CLUSTER_BLOCK_SIZE - blk->bimg_len; datatotal += blk->bimg_len; /* * cross-check that hole_offset > 0, hole_length > 0 and - * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + * bimg_len < CLUSTER_BLOCK_SIZE if the HAS_HOLE flag is set. 
*/ if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && (blk->hole_offset == 0 || blk->hole_length == 0 || - blk->bimg_len == BLCKSZ)) + blk->bimg_len == CLUSTER_BLOCK_SIZE)) { report_invalid_record(state, "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", @@ -1827,10 +1827,10 @@ DecodeXLogRecord(XLogReaderState *state, } /* - * Cross-check that bimg_len < BLCKSZ if it is compressed. + * Cross-check that bimg_len < CLUSTER_BLOCK_SIZE if it is compressed. */ if (BKPIMAGE_COMPRESSED(blk->bimg_info) && - blk->bimg_len == BLCKSZ) + blk->bimg_len == CLUSTER_BLOCK_SIZE) { report_invalid_record(state, "BKPIMAGE_COMPRESSED set, but block image length %u at %X/%X", @@ -1840,12 +1840,12 @@ DecodeXLogRecord(XLogReaderState *state, } /* - * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE is + * cross-check that bimg_len = CLUSTER_BLOCK_SIZE if neither HAS_HOLE is * set nor COMPRESSED(). */ if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && !BKPIMAGE_COMPRESSED(blk->bimg_info) && - blk->bimg_len != BLCKSZ) + blk->bimg_len != CLUSTER_BLOCK_SIZE) { report_invalid_record(state, "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_COMPRESSED set, but block image length is %u at %X/%X", @@ -2077,14 +2077,14 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_PGLZ) != 0) { if (pglz_decompress(ptr, bkpb->bimg_len, tmp.data, - BLCKSZ - bkpb->hole_length, true) < 0) + CLUSTER_BLOCK_SIZE - bkpb->hole_length, true) < 0) decomp_success = false; } else if ((bkpb->bimg_info & BKPIMAGE_COMPRESS_LZ4) != 0) { #ifdef USE_LZ4 if (LZ4_decompress_safe(ptr, tmp.data, - bkpb->bimg_len, BLCKSZ - bkpb->hole_length) <= 0) + bkpb->bimg_len, CLUSTER_BLOCK_SIZE - bkpb->hole_length) <= 0) decomp_success = false; #else report_invalid_record(record, "could not restore image at %X/%X compressed with %s not supported by build, block %d", @@ -2098,7 +2098,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { 
#ifdef USE_ZSTD size_t decomp_result = ZSTD_decompress(tmp.data, - BLCKSZ - bkpb->hole_length, + CLUSTER_BLOCK_SIZE - bkpb->hole_length, ptr, bkpb->bimg_len); if (ZSTD_isError(decomp_result)) @@ -2133,7 +2133,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) /* generate page, taking into account hole if necessary */ if (bkpb->hole_length == 0) { - memcpy(page, ptr, BLCKSZ); + memcpy(page, ptr, CLUSTER_BLOCK_SIZE); } else { @@ -2142,7 +2142,7 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), ptr + bkpb->hole_offset, - BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); + CLUSTER_BLOCK_SIZE - (bkpb->hole_offset + bkpb->hole_length)); } return true; diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index becc2bda62..771a626ffc 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -293,7 +293,7 @@ static bool backupEndRequired = false; */ bool reachedConsistency = false; -/* Buffers dedicated to consistency checks of size BLCKSZ */ +/* Buffers dedicated to consistency checks of size CLUSTER_BLOCK_SIZE */ static char *replay_image_masked = NULL; static char *primary_image_masked = NULL; @@ -606,8 +606,8 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, * (2) a static char array isn't guaranteed to have any particular * alignment, whereas palloc() will provide MAXALIGN'd storage. 
*/ - replay_image_masked = (char *) palloc(BLCKSZ); - primary_image_masked = (char *) palloc(BLCKSZ); + replay_image_masked = (char *) palloc(CLUSTER_BLOCK_SIZE); + primary_image_masked = (char *) palloc(CLUSTER_BLOCK_SIZE); if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired, &backupFromStandby)) @@ -2432,7 +2432,7 @@ verifyBackupPageConsistency(XLogReaderState *record) * Take a copy of the local page where WAL has been applied to have a * comparison base before masking it... */ - memcpy(replay_image_masked, page, BLCKSZ); + memcpy(replay_image_masked, page, CLUSTER_BLOCK_SIZE); /* No need for this page anymore now that a copy is in. */ UnlockReleaseBuffer(buf); @@ -2467,7 +2467,7 @@ verifyBackupPageConsistency(XLogReaderState *record) } /* Time to compare the primary and replay images. */ - if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) + if (memcmp(replay_image_masked, primary_image_masked, CLUSTER_BLOCK_SIZE) != 0) { elog(FATAL, "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c index 45be21131c..b37502bf36 100644 --- a/src/backend/backup/basebackup.c +++ b/src/backend/backup/basebackup.c @@ -23,6 +23,7 @@ #include "backup/basebackup_sink.h" #include "backup/basebackup_target.h" #include "commands/defrem.h" +#include "common/blocksize.h" #include "common/compression.h" #include "common/file_perm.h" #include "lib/stringinfo.h" @@ -55,7 +56,7 @@ * NB: The buffer size is required to be a multiple of the system block * size, so use that value instead if it's bigger than our preference. */ -#define SINK_BUFFER_LENGTH Max(32768, BLCKSZ) +#define SINK_BUFFER_LENGTH Max(32768, CLUSTER_BLOCK_SIZE) typedef struct { @@ -370,8 +371,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) else { /* Properly terminate the tarfile. 
*/ - StaticAssertDecl(2 * TAR_BLOCK_SIZE <= BLCKSZ, - "BLCKSZ too small for 2 tar blocks"); + StaticAssertDecl(2 * TAR_BLOCK_SIZE <= MAX_BLOCK_SIZE, + "CLUSTER_BLOCK_SIZE too small for 2 tar blocks"); memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); @@ -623,8 +624,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink) } /* Properly terminate the tar file. */ - StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ, - "BLCKSZ too small for 2 tar blocks"); + StaticAssertStmt(2 * TAR_BLOCK_SIZE <= MAX_BLOCK_SIZE, + "CLUSTER_BLOCK_SIZE too small for 2 tar blocks"); memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE); bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE); @@ -1562,27 +1563,27 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, /* * The checksums are verified at block level, so we iterate over the - * buffer in chunks of BLCKSZ, after making sure that - * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of - * BLCKSZ bytes. + * buffer in chunks of CLUSTER_BLOCK_SIZE, after making sure that + * TAR_SEND_SIZE/buf is divisible by CLUSTER_BLOCK_SIZE and we read a multiple of + * CLUSTER_BLOCK_SIZE bytes. 
*/ - Assert((sink->bbs_buffer_length % BLCKSZ) == 0); + Assert((sink->bbs_buffer_length % CLUSTER_BLOCK_SIZE) == 0); - if (verify_checksum && (cnt % BLCKSZ != 0)) + if (verify_checksum && (cnt % CLUSTER_BLOCK_SIZE != 0)) { ereport(WARNING, (errmsg("could not verify checksum in file \"%s\", block " "%u: read buffer size %d and page size %d " "differ", - readfilename, blkno, (int) cnt, BLCKSZ))); + readfilename, blkno, (int) cnt, CLUSTER_BLOCK_SIZE))); verify_checksum = false; } if (verify_checksum) { - for (i = 0; i < cnt / BLCKSZ; i++) + for (i = 0; i < cnt / CLUSTER_BLOCK_SIZE; i++) { - page = sink->bbs_buffer + BLCKSZ * i; + page = sink->bbs_buffer + CLUSTER_BLOCK_SIZE * i; /* * Only check pages which have not been modified since the @@ -1594,7 +1595,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, */ if (!PageIsNew(page) && PageGetLSN(page) < sink->bbs_state->startptr) { - checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE, CLUSTER_BLOCK_SIZE); phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { @@ -1625,8 +1626,8 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, /* Reread the failed block */ reread_cnt = basebackup_read_file(fd, - sink->bbs_buffer + BLCKSZ * i, - BLCKSZ, len + BLCKSZ * i, + sink->bbs_buffer + CLUSTER_BLOCK_SIZE * i, + CLUSTER_BLOCK_SIZE, len + CLUSTER_BLOCK_SIZE * i, readfilename, false); if (reread_cnt == 0) @@ -1639,7 +1640,7 @@ sendFile(bbsink *sink, const char *readfilename, const char *tarfilename, * code that handles that case. (We must fix * up cnt first, though.) */ - cnt = BLCKSZ * i; + cnt = CLUSTER_BLOCK_SIZE * i; break; } @@ -1745,12 +1746,12 @@ _tarWriteHeader(bbsink *sink, const char *filename, const char *linktarget, /* * As of this writing, the smallest supported block size is 1kB, which * is twice TAR_BLOCK_SIZE. 
Since the buffer size is required to be a - * multiple of BLCKSZ, it should be safe to assume that the buffer is + * multiple of CLUSTER_BLOCK_SIZE, it should be safe to assume that the buffer is * large enough to fit an entire tar block. We double-check by means * of these assertions. */ - StaticAssertDecl(TAR_BLOCK_SIZE <= BLCKSZ, - "BLCKSZ too small for tar block"); + StaticAssertDecl(TAR_BLOCK_SIZE <= DEFAULT_BLOCK_SIZE, + "DEFAULT_BLOCK_SIZE too small for tar block"); Assert(sink->bbs_buffer_length >= TAR_BLOCK_SIZE); rc = tarCreateHeader(sink->bbs_buffer, filename, linktarget, diff --git a/src/backend/backup/basebackup_lz4.c b/src/backend/backup/basebackup_lz4.c index 7acb606564..3634eec9b5 100644 --- a/src/backend/backup/basebackup_lz4.c +++ b/src/backend/backup/basebackup_lz4.c @@ -117,10 +117,10 @@ bbsink_lz4_begin_backup(bbsink *sink) &mysink->prefs); /* - * The buffer length is expected to be a multiple of BLCKSZ, so round up. + * The buffer length is expected to be a multiple of CLUSTER_BLOCK_SIZE, so round up. */ - output_buffer_bound = output_buffer_bound + BLCKSZ - - (output_buffer_bound % BLCKSZ); + output_buffer_bound = output_buffer_bound + CLUSTER_BLOCK_SIZE - + (output_buffer_bound % CLUSTER_BLOCK_SIZE); bbsink_begin_backup(sink->bbs_next, sink->bbs_state, output_buffer_bound); } diff --git a/src/backend/backup/basebackup_zstd.c b/src/backend/backup/basebackup_zstd.c index 1bb5820c88..567f334e23 100644 --- a/src/backend/backup/basebackup_zstd.c +++ b/src/backend/backup/basebackup_zstd.c @@ -143,10 +143,10 @@ bbsink_zstd_begin_backup(bbsink *sink) output_buffer_bound = ZSTD_compressBound(mysink->base.bbs_buffer_length); /* - * The buffer length is expected to be a multiple of BLCKSZ, so round up. + * The buffer length is expected to be a multiple of CLUSTER_BLOCK_SIZE, so round up. 
*/ - output_buffer_bound = output_buffer_bound + BLCKSZ - - (output_buffer_bound % BLCKSZ); + output_buffer_bound = output_buffer_bound + CLUSTER_BLOCK_SIZE - + (output_buffer_bound % CLUSTER_BLOCK_SIZE); bbsink_begin_backup(sink->bbs_next, sink->bbs_state, output_buffer_bound); } diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 49e956b2c5..af66b12ac0 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -28,6 +28,7 @@ #include "catalog/index.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "common/blocksize.h" #include "common/link-canary.h" #include "libpq/pqsignal.h" #include "miscadmin.h" @@ -46,6 +47,7 @@ #include "utils/relmapper.h" uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +uint32 bootstrap_blocksize = DEFAULT_BLOCK_SIZE; static void CheckerModeMain(void); @@ -221,10 +223,18 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "b:B:c:d:D:Fkr:X:-:")) != -1) { switch (flag) { + case 'b': + bootstrap_blocksize = strtol(optarg, NULL, 0); + if (!IsValidBlockSize(bootstrap_blocksize)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid block size: %s; must be power of two between 1k and 32k", + optarg))); + break; case 'B': SetConfigOption("shared_buffers", optarg, PGC_POSTMASTER, PGC_S_ARGV); break; @@ -309,6 +319,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) if (!SelectConfigFiles(userDoption, progname)) proc_exit(1); + BlockSizeInit(bootstrap_blocksize); + /* * Validate we have been given a reasonable-looking DataDir and change * into it diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 2add053489..05877dcdd9 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -800,7 +800,7 @@ smgrDoPendingSyncs(bool isCommit, 
bool isParallelWorker) * main fork is longer than ever but FSM fork gets shorter. */ if (pendingsync->is_truncated || - total_blocks * BLCKSZ / 1024 >= wal_skip_threshold) + total_blocks * CLUSTER_BLOCK_SIZE / 1024 >= wal_skip_threshold) { /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index fc9a371f9b..bd6eccfa5e 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -771,9 +771,9 @@ do_analyze_rel(Relation onerel, VacuumParams *params, if (delay_in_ms > 0) { - read_rate = (double) BLCKSZ * AnalyzePageMiss / (1024 * 1024) / + read_rate = (double) CLUSTER_BLOCK_SIZE * AnalyzePageMiss / (1024 * 1024) / (delay_in_ms / 1000.0); - write_rate = (double) BLCKSZ * AnalyzePageDirty / (1024 * 1024) / + write_rate = (double) CLUSTER_BLOCK_SIZE * AnalyzePageDirty / (1024 * 1024) / (delay_in_ms / 1000.0); } diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index ef909cf4e0..ed953a5a1f 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -163,7 +163,7 @@ * than that, so changes in that data structure won't affect user-visible * restrictions. */ -#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128) +#define NOTIFY_PAYLOAD_MAX_LENGTH (DEFAULT_BLOCK_SIZE - NAMEDATALEN - 128) /* * Struct representing an entry in the global notify queue @@ -311,7 +311,7 @@ static AsyncQueueControl *asyncQueueControl; static SlruCtlData NotifyCtlData; #define NotifyCtl (&NotifyCtlData) -#define QUEUE_PAGESIZE BLCKSZ +#define QUEUE_PAGESIZE DEFAULT_BLOCK_SIZE #define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */ /* @@ -322,7 +322,7 @@ static SlruCtlData NotifyCtlData; * * The most data we can have in the queue at a time is QUEUE_MAX_PAGE/2 * pages, because more than that would confuse slru.c into thinking there - * was a wraparound condition. 
With the default BLCKSZ this means there + * was a wraparound condition. With DEFAULT_BLOCK_SIZE pages this means there + * can be up to 8GB of queued-and-not-read data. * * Note: it's possible to redefine QUEUE_MAX_PAGE with a smaller multiple of diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c index a79067fd46..188cf7875e 100644 --- a/src/backend/commands/vacuumparallel.c +++ b/src/backend/commands/vacuumparallel.c @@ -1029,7 +1029,7 @@ parallel_vacuum_main(dsm_segment *seg, shm_toc *toc) /* Each parallel VACUUM worker gets its own access strategy. */ pvs.bstrategy = GetAccessStrategyWithSize(BAS_VACUUM, - shared->ring_nbuffers * (BLCKSZ / 1024)); + shared->ring_nbuffers * (CLUSTER_BLOCK_SIZE / 1024)); /* Setup error traceback support for ereport() */ errcallback.callback = parallel_vacuum_error_callback; diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 468db94fe5..e8aad64aac 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -216,8 +216,8 @@ * * Tapes' buffers can take up substantial memory when many tapes are open at * once. We only need one tape open at a time in read mode (using a buffer - * that's a multiple of BLCKSZ); but we need one tape open in write mode (each - * requiring a buffer of size BLCKSZ) for each partition. + * that's a multiple of CLUSTER_BLOCK_SIZE); but we need one tape open in write mode (each + * requiring a buffer of size CLUSTER_BLOCK_SIZE) for each partition. * * Note that it's possible for transition states to start small but then * grow very large; for instance in the case of ARRAY_AGG. In such cases, @@ -299,12 +299,12 @@ /* * For reading from tapes, the buffer size must be a multiple of - * BLCKSZ. Larger values help when reading from multiple tapes concurrently, - * but that doesn't happen in HashAgg, so we simply use BLCKSZ. Writing to a - * tape always uses a buffer of size BLCKSZ. + * CLUSTER_BLOCK_SIZE. 
Larger values help when reading from multiple tapes concurrently, + * but that doesn't happen in HashAgg, so we simply use CLUSTER_BLOCK_SIZE. Writing to a + * tape always uses a buffer of size CLUSTER_BLOCK_SIZE. */ -#define HASHAGG_READ_BUFFER_SIZE BLCKSZ -#define HASHAGG_WRITE_BUFFER_SIZE BLCKSZ +#define HASHAGG_READ_BUFFER_SIZE CLUSTER_BLOCK_SIZE +#define HASHAGG_WRITE_BUFFER_SIZE CLUSTER_BLOCK_SIZE /* * HyperLogLog is used for estimating the cardinality of the spilled tuples in @@ -1945,7 +1945,7 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions) /* update disk usage */ if (aggstate->hash_tapeset != NULL) { - uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (BLCKSZ / 1024); + uint64 disk_used = LogicalTapeSetBlocks(aggstate->hash_tapeset) * (CLUSTER_BLOCK_SIZE / 1024); if (aggstate->hash_disk_used < disk_used) aggstate->hash_disk_used = disk_used; diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 29a1858441..1b34967644 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -13,7 +13,7 @@ * fact that a particular page needs to be visited. * * The "lossy" storage uses one bit per disk page, so at the standard 8K - * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb + * CLUSTER_BLOCK_SIZE, we can represent all pages in 64Gb of disk space in about 1Mb * of memory. People pushing around tables of that size should have a * couple of Mb to spare, so we don't worry about providing a second level * of lossiness. In theory we could fall back to page ranges at some @@ -53,7 +53,7 @@ * the per-page bitmaps variable size. We just legislate that the size * is this: */ -#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPage +#define MAX_TUPLES_PER_PAGE MaxHeapTuplesPerPageLimit /* * When we have to switch over to lossy storage, we use a data structure @@ -70,7 +70,7 @@ * too different. 
But we also want PAGES_PER_CHUNK to be a power of 2 to * avoid expensive integer remainder operations. So, define it like this: */ -#define PAGES_PER_CHUNK (BLCKSZ / 32) +#define PAGES_PER_CHUNK (MAX_BLOCK_SIZE / 32) /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index ef475d95a1..61d95d7412 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1890,7 +1890,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost, /* * We'll have to use a disk-based sort of all the tuples */ - double npages = ceil(input_bytes / BLCKSZ); + double npages = ceil(input_bytes / CLUSTER_BLOCK_SIZE); double nruns = input_bytes / sort_mem_bytes; double mergeorder = tuplesort_merge_order(sort_mem_bytes); double log_runs; @@ -2455,7 +2455,7 @@ cost_material(Path *path, */ if (nbytes > work_mem_bytes) { - double npages = ceil(nbytes / BLCKSZ); + double npages = ceil(nbytes / CLUSTER_BLOCK_SIZE); run_cost += seq_page_cost * npages; } @@ -2764,7 +2764,7 @@ cost_agg(Path *path, PlannerInfo *root, * Estimate number of pages read and written. For each level of * recursion, a tuple must be written and then later read. 
*/ - pages = relation_byte_size(input_tuples, input_width) / BLCKSZ; + pages = relation_byte_size(input_tuples, input_width) / CLUSTER_BLOCK_SIZE; pages_written = pages_read = pages * depth; /* @@ -4310,7 +4310,7 @@ cost_rescan(PlannerInfo *root, Path *path, if (nbytes > work_mem_bytes) { /* It will spill, so account for re-read cost */ - double npages = ceil(nbytes / BLCKSZ); + double npages = ceil(nbytes / CLUSTER_BLOCK_SIZE); run_cost += seq_page_cost * npages; } @@ -4337,7 +4337,7 @@ cost_rescan(PlannerInfo *root, Path *path, if (nbytes > work_mem_bytes) { /* It will spill, so account for re-read cost */ - double npages = ceil(nbytes / BLCKSZ); + double npages = ceil(nbytes / CLUSTER_BLOCK_SIZE); run_cost += seq_page_cost * npages; } @@ -6115,7 +6115,7 @@ relation_byte_size(double tuples, int width) static double page_size(double tuples, int width) { - return ceil(relation_byte_size(tuples, width) / BLCKSZ); + return ceil(relation_byte_size(tuples, width) / CLUSTER_BLOCK_SIZE); } /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 39932d3c2d..61228772bc 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1089,7 +1089,7 @@ estimate_rel_size(Relation rel, int32 *attr_widths, tuple_width += MAXALIGN(SizeofHeapTupleHeader); tuple_width += sizeof(ItemIdData); /* note: integer division is intentional here */ - density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width; + density = (CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData) / tuple_width; } *tuples = rint(density * (double) curpages); diff --git a/src/backend/po/de.po b/src/backend/po/de.po index e3d9daa041..b538af2011 100644 --- a/src/backend/po/de.po +++ b/src/backend/po/de.po @@ -2314,8 +2314,8 @@ msgstr "Der Datenbank-Cluster verwendet anscheinend ein anderes Fließkommazahle #: access/transam/xlog.c:4070 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." 
-msgstr "Der Datenbank-Cluster wurde mit BLCKSZ %d initialisiert, aber der Server wurde mit BLCKSZ %d kompiliert." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Der Datenbank-Cluster wurde mit CLUSTER_BLOCK_SIZE %d initialisiert, aber der Server wurde mit CLUSTER_BLOCK_SIZE %d kompiliert." #: access/transam/xlog.c:4073 access/transam/xlog.c:4080 #: access/transam/xlog.c:4087 access/transam/xlog.c:4094 @@ -22111,8 +22111,8 @@ msgstr "debug_io_direct wird für WAL nicht unterstützt, weil XLOG_BLCKSZ zu kl #: storage/file/fd.c:3951 #, c-format -msgid "debug_io_direct is not supported for data because BLCKSZ is too small" -msgstr "debug_io_direct wird für Daten nicht unterstützt, weil BLCKSZ zu klein ist" +msgid "debug_io_direct is not supported for data because DEFAULT_BLOCK_SIZE is too small" +msgstr "debug_io_direct wird für Daten nicht unterstützt, weil DEFAULT_BLOCK_SIZE zu klein ist" #: storage/file/reinit.c:145 #, c-format diff --git a/src/backend/po/es.po b/src/backend/po/es.po index e50a935033..6a44662063 100644 --- a/src/backend/po/es.po +++ b/src/backend/po/es.po @@ -2387,8 +2387,8 @@ msgstr "Los archivos de la base de datos parecen usar un formato de número de c #: access/transam/xlog.c:4070 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Los archivos de base de datos fueron inicializados con BLCKSZ %d, pero el servidor fue compilado con BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Los archivos de base de datos fueron inicializados con CLUSTER_BLOCK_SIZE %d, pero el servidor fue compilado con CLUSTER_BLOCK_SIZE %d." 
#: access/transam/xlog.c:4073 access/transam/xlog.c:4080 #: access/transam/xlog.c:4087 access/transam/xlog.c:4094 @@ -21808,7 +21808,7 @@ msgstr "" #: storage/file/fd.c:3901 #, c-format -msgid "io_direct is not supported for data because BLCKSZ is too small" +msgid "io_direct is not supported for data because CLUSTER_BLOCK_SIZE is too small" msgstr "" #: storage/file/reinit.c:145 diff --git a/src/backend/po/fr.po b/src/backend/po/fr.po index ff2a32bdb2..f061583bc6 100644 --- a/src/backend/po/fr.po +++ b/src/backend/po/fr.po @@ -2380,10 +2380,10 @@ msgstr "" #: access/transam/xlog.c:4096 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." msgstr "" -"Le cluster de base de données a été initialisé avec un BLCKSZ à %d alors que\n" -"le serveur a été compilé avec un BLCKSZ à %d." +"Le cluster de base de données a été initialisé avec un CLUSTER_BLOCK_SIZE à %d alors que\n" +"le serveur a été compilé avec un CLUSTER_BLOCK_SIZE à %d." #: access/transam/xlog.c:4099 access/transam/xlog.c:4106 #: access/transam/xlog.c:4113 access/transam/xlog.c:4120 diff --git a/src/backend/po/id.po b/src/backend/po/id.po index d5d484132b..9817073200 100644 --- a/src/backend/po/id.po +++ b/src/backend/po/id.po @@ -1089,8 +1089,8 @@ msgstr "cluster database sepertinya menggunakan format nomor floating-point yang #: access/transam/xlog.c:3648 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "cluster database telah diinisialkan dengan BLCKSZ %d, tapi server telah dikompilasi dengan BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." 
+msgstr "cluster database telah diinisialkan dengan CLUSTER_BLOCK_SIZE %d, tapi server telah dikompilasi dengan CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:3651 access/transam/xlog.c:3658 access/transam/xlog.c:3665 access/transam/xlog.c:3672 access/transam/xlog.c:3679 access/transam/xlog.c:3686 access/transam/xlog.c:3693 access/transam/xlog.c:3701 access/transam/xlog.c:3708 access/transam/xlog.c:3717 access/transam/xlog.c:3724 access/transam/xlog.c:3733 access/transam/xlog.c:3740 #, c-format diff --git a/src/backend/po/it.po b/src/backend/po/it.po index 673e2aaf00..c0e6beee5e 100644 --- a/src/backend/po/it.po +++ b/src/backend/po/it.po @@ -2067,8 +2067,8 @@ msgstr "Il cluster di database sta usando un formato per i numeri in virgola mob #: access/transam/xlog.c:4096 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Il cluster di database è stato inizializzato con BLCKSZ %d, ma il server è stato compilato con BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Il cluster di database è stato inizializzato con CLUSTER_BLOCK_SIZE %d, ma il server è stato compilato con CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:4099 access/transam/xlog.c:4106 access/transam/xlog.c:4113 access/transam/xlog.c:4120 access/transam/xlog.c:4127 access/transam/xlog.c:4134 access/transam/xlog.c:4141 access/transam/xlog.c:4149 access/transam/xlog.c:4156 #, c-format diff --git a/src/backend/po/ja.po b/src/backend/po/ja.po index 9f6e2c3250..6a9d07a2ad 100644 --- a/src/backend/po/ja.po +++ b/src/backend/po/ja.po @@ -2081,8 +2081,8 @@ msgstr "データベースクラスタはサーバー実行ファイルと異な #: access/transam/xlog.c:4070 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." 
-msgstr "データベースクラスタは BLCKSZ %d で初期化されましたが、サーバーは BLCKSZ %d でコンパイルされています。" +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "データベースクラスタは CLUSTER_BLOCK_SIZE %d で初期化されましたが、サーバーは CLUSTER_BLOCK_SIZE %d でコンパイルされています。" #: access/transam/xlog.c:4073 access/transam/xlog.c:4080 access/transam/xlog.c:4087 access/transam/xlog.c:4094 access/transam/xlog.c:4101 access/transam/xlog.c:4108 access/transam/xlog.c:4115 access/transam/xlog.c:4123 access/transam/xlog.c:4130 #, c-format @@ -21426,8 +21426,8 @@ msgstr "XLOG_BLCKSZが小さすぎるためdebug_io_directはWALに対しては #: storage/file/fd.c:3901 #, c-format -msgid "debug_io_direct is not supported for data because BLCKSZ is too small" -msgstr "BLCKSZが小さすぎるためdebug_io_directはデータに対してサポートされません" +msgid "debug_io_direct is not supported for data because CLUSTER_BLOCK_SIZE is too small" +msgstr "CLUSTER_BLOCK_SIZEが小さすぎるためdebug_io_directはデータに対してサポートされません" #: storage/file/reinit.c:145 #, c-format diff --git a/src/backend/po/ko.po b/src/backend/po/ko.po index f330f3da7e..1627570ed3 100644 --- a/src/backend/po/ko.po +++ b/src/backend/po/ko.po @@ -2348,10 +2348,10 @@ msgstr "" #: access/transam/xlog.c:4802 #, c-format msgid "" -"The database cluster was initialized with BLCKSZ %d, but the server was " -"compiled with BLCKSZ %d." +"The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was " +"compiled with CLUSTER_BLOCK_SIZE %d." msgstr "" -"이 데이터베이스 클러스터는 BLCKSZ %d (으)로 초기화 되었지만, 서버는 BLCKSZ " +"이 데이터베이스 클러스터는 CLUSTER_BLOCK_SIZE %d (으)로 초기화 되었지만, 서버는 CLUSTER_BLOCK_SIZE " "%d (으)로 컴파일 되어있습니다." 
#: access/transam/xlog.c:4805 access/transam/xlog.c:4812 diff --git a/src/backend/po/pl.po b/src/backend/po/pl.po index 3ac9d0451c..f2c70fc819 100644 --- a/src/backend/po/pl.po +++ b/src/backend/po/pl.po @@ -1927,8 +1927,8 @@ msgstr "Klaster bazy danych wydaje się używać innego formatu liczb zmiennoprz #: access/transam/xlog.c:4550 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Klaster bazy danych został zainicjowany z BLCKSZ %d, ale serwer był skompilowany z BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Klaster bazy danych został zainicjowany z CLUSTER_BLOCK_SIZE %d, ale serwer był skompilowany z CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:4553 access/transam/xlog.c:4560 #: access/transam/xlog.c:4567 access/transam/xlog.c:4574 diff --git a/src/backend/po/pt_BR.po b/src/backend/po/pt_BR.po index 37e4a28f07..949e441816 100644 --- a/src/backend/po/pt_BR.po +++ b/src/backend/po/pt_BR.po @@ -1264,7 +1264,7 @@ msgstr "O agrupamento de banco de dados parece utilizar um formato de número de #: access/transam/xlog.c:4506 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." msgstr "O agrupamento de banco de dados foi inicializado com BLCSZ %d, mas o servidor foi compilado com BLCSZ %d." #: access/transam/xlog.c:4509 access/transam/xlog.c:4516 diff --git a/src/backend/po/ru.po b/src/backend/po/ru.po index ae9c50eed7..9118189857 100644 --- a/src/backend/po/ru.po +++ b/src/backend/po/ru.po @@ -2619,11 +2619,11 @@ msgstr "" #: access/transam/xlog.c:4096 #, c-format msgid "" -"The database cluster was initialized with BLCKSZ %d, but the server was " -"compiled with BLCKSZ %d." 
+"The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was " +"compiled with CLUSTER_BLOCK_SIZE %d." msgstr "" -"Кластер баз данных был инициализирован с BLCKSZ %d, но сервер скомпилирован " -"с BLCKSZ %d." +"Кластер баз данных был инициализирован с CLUSTER_BLOCK_SIZE %d, но сервер скомпилирован " +"с CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:4099 access/transam/xlog.c:4106 #: access/transam/xlog.c:4113 access/transam/xlog.c:4120 diff --git a/src/backend/po/sv.po b/src/backend/po/sv.po index a5ac231971..2d5aef8b02 100644 --- a/src/backend/po/sv.po +++ b/src/backend/po/sv.po @@ -2306,8 +2306,8 @@ msgstr "Databasklustret verkar använda en annan flyttalsrepresentation än vad #: access/transam/xlog.c:4096 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Databasklustret initierades med BLCKSZ %d, men servern kompilerades med BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Databasklustret initierades med CLUSTER_BLOCK_SIZE %d, men servern kompilerades med CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:4099 access/transam/xlog.c:4106 #: access/transam/xlog.c:4113 access/transam/xlog.c:4120 diff --git a/src/backend/po/tr.po b/src/backend/po/tr.po index b791e886b9..87074381a5 100644 --- a/src/backend/po/tr.po +++ b/src/backend/po/tr.po @@ -1761,8 +1761,8 @@ msgstr "Veritabanı dosyaları, sunucu programından farklı ondalık sayı biç #: access/transam/xlog.c:4676 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Veritabanı clusteri BLCKSZ %d ile ilklendirilmiştir, ancak sunucu BLCKSZ %d ile derlenmiştir." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." 
+msgstr "Veritabanı clusteri CLUSTER_BLOCK_SIZE %d ile ilklendirilmiştir, ancak sunucu CLUSTER_BLOCK_SIZE %d ile derlenmiştir." #: access/transam/xlog.c:4679 access/transam/xlog.c:4686 access/transam/xlog.c:4693 access/transam/xlog.c:4700 access/transam/xlog.c:4707 access/transam/xlog.c:4714 access/transam/xlog.c:4721 access/transam/xlog.c:4729 access/transam/xlog.c:4736 access/transam/xlog.c:4745 access/transam/xlog.c:4752 #, c-format diff --git a/src/backend/po/uk.po b/src/backend/po/uk.po index 1095fd9139..ef22b41d43 100644 --- a/src/backend/po/uk.po +++ b/src/backend/po/uk.po @@ -2288,8 +2288,8 @@ msgstr "Здається, в кластері баз даних і в прогр #: access/transam/xlog.c:4096 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "Кластер бази даних було ініціалізовано з BLCKSZ %d, але сервер було скомпільовано з BLCKSZ %d." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "Кластер бази даних було ініціалізовано з CLUSTER_BLOCK_SIZE %d, але сервер було скомпільовано з CLUSTER_BLOCK_SIZE %d." #: access/transam/xlog.c:4099 access/transam/xlog.c:4106 #: access/transam/xlog.c:4113 access/transam/xlog.c:4120 diff --git a/src/backend/po/zh_CN.po b/src/backend/po/zh_CN.po index 574684d775..6acbabf785 100644 --- a/src/backend/po/zh_CN.po +++ b/src/backend/po/zh_CN.po @@ -1852,8 +1852,8 @@ msgstr "数据库集群在使用与服务器执行部分不同的浮点数格式 #: access/transam/xlog.c:4677 #, c-format -msgid "The database cluster was initialized with BLCKSZ %d, but the server was compiled with BLCKSZ %d." -msgstr "数据库簇是以 BLCKSZ %d 初始化的, 但是 服务器是以 BLCKSZ %d 编译的." +msgid "The database cluster was initialized with CLUSTER_BLOCK_SIZE %d, but the server was compiled with CLUSTER_BLOCK_SIZE %d." +msgstr "数据库簇是以 CLUSTER_BLOCK_SIZE %d 初始化的, 但是 服务器是以 CLUSTER_BLOCK_SIZE %d 编译的." 
#: access/transam/xlog.c:4680 access/transam/xlog.c:4687 #: access/transam/xlog.c:4694 access/transam/xlog.c:4701 diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 0ee764d68f..83d6d735d9 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2062,7 +2062,7 @@ apply_spooled_messages(FileSet *stream_fileset, TransactionId xid, CurrentResourceOwner = oldowner; - buffer = palloc(BLCKSZ); + buffer = palloc(CLUSTER_BLOCK_SIZE); initStringInfo(&s2); MemoryContextSwitchTo(oldcxt); diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 0057443f0c..b7e0d4afa5 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -82,7 +82,7 @@ InitBufferPool(void) BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffers * (Size) CLUSTER_BLOCK_SIZE + PG_IO_ALIGN_SIZE, &foundBufs)); /* Align condition variables to cacheline boundary. */ @@ -168,7 +168,7 @@ BufferShmemSize(void) /* size of data pages, plus alignment padding */ size = add_size(size, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); + size = add_size(size, mul_size(NBuffers, CLUSTER_BLOCK_SIZE)); /* size of stuff controlled by freelist.c */ size = add_size(size, StrategyShmemSize()); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 3c59bbd04e..f31f15d7f7 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -60,7 +60,7 @@ /* Note: these two macros only work on shared buffers, not local ones! 
*/ -#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) +#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * CLUSTER_BLOCK_SIZE)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) /* Note: this macro only works on local buffers, not shared ones! */ @@ -1116,7 +1116,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * wants us to allocate a buffer. */ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) - MemSet((char *) bufBlock, 0, BLCKSZ); + MemSet((char *) bufBlock, 0, CLUSTER_BLOCK_SIZE); else { instr_time io_start = pgstat_prepare_io_time(); @@ -1137,7 +1137,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, errmsg("invalid page in block %u of relation %s; zeroing out page", blockNum, relpath(smgr->smgr_rlocator, forkNum)))); - MemSet((char *) bufBlock, 0, BLCKSZ); + MemSet((char *) bufBlock, 0, CLUSTER_BLOCK_SIZE); } else ereport(ERROR, @@ -1856,7 +1856,7 @@ ExtendBufferedRelShared(ExtendBufferedWhat eb, buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1)); /* new buffers are zero-filled */ - MemSet((char *) buf_block, 0, BLCKSZ); + MemSet((char *) buf_block, 0, CLUSTER_BLOCK_SIZE); } /* in case we need to pin an existing buffer below */ @@ -2285,7 +2285,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) * not generally guaranteed to be marked undefined or * non-accessible in any case. 
*/ - VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), CLUSTER_BLOCK_SIZE); break; } } @@ -2350,7 +2350,7 @@ PinBuffer_Locked(BufferDesc *buf) * Valgrind (this is similar to the PinBuffer() case where the backend * doesn't already have a buffer pin) */ - VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), CLUSTER_BLOCK_SIZE); /* * Since we hold the buffer spinlock, we can update the buffer state and @@ -2403,7 +2403,7 @@ UnpinBuffer(BufferDesc *buf) * within access method code that enforces that buffers are only * accessed while a buffer lock is held. */ - VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); + VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), CLUSTER_BLOCK_SIZE); /* I'd better not still hold the buffer content lock */ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); @@ -3491,7 +3491,7 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind)) { /* - * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore + * Not every table AM uses CLUSTER_BLOCK_SIZE wide fixed size blocks. Therefore * tableam returns the size in bytes - but for the purpose of this * routine, we want the number of blocks. Therefore divide, rounding * up. @@ -3500,7 +3500,7 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) szbytes = table_relation_size(relation, forkNum); - return (szbytes + (BLCKSZ - 1)) / BLCKSZ; + return (szbytes + (CLUSTER_BLOCK_SIZE - 1)) / CLUSTER_BLOCK_SIZE; } else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) { @@ -4288,7 +4288,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * Bulk extend the destination relation of the same size as the source * relation before starting to copy block by block. 
*/ - memset(buf.data, 0, BLCKSZ); + memset(buf.data, 0, CLUSTER_BLOCK_SIZE); smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1, buf.data, true); @@ -4316,7 +4316,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, START_CRIT_SECTION(); /* Copy page data from the source to the destination. */ - memcpy(dstPage, srcPage, BLCKSZ); + memcpy(dstPage, srcPage, CLUSTER_BLOCK_SIZE); MarkBufferDirty(dstBuf); /* WAL-log the copied page. */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 1c804fd2f5..326de13d10 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -589,7 +589,7 @@ GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb) Assert(ring_size_kb >= 0); /* Figure out how many buffers ring_size_kb is */ - ring_buffers = ring_size_kb / (BLCKSZ / 1024); + ring_buffers = ring_size_kb / (CLUSTER_BLOCK_SIZE / 1024); /* 0 means unlimited, so no BufferAccessStrategy required */ if (ring_buffers == 0) diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index f684862d98..2939f908d4 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -335,7 +335,7 @@ ExtendBufferedRelLocal(ExtendBufferedWhat eb, buf_block = LocalBufHdrGetBlock(buf_hdr); /* new buffers are zero-filled */ - MemSet((char *) buf_block, 0, BLCKSZ); + MemSet((char *) buf_block, 0, CLUSTER_BLOCK_SIZE); } first_block = smgrnblocks(eb.smgr, fork); @@ -745,19 +745,19 @@ GetLocalBufferStorage(void) /* But not more than what we need for all remaining local bufs */ num_bufs = Min(num_bufs, NLocBuffer - total_bufs_allocated); /* And don't overflow MaxAllocSize, either */ - num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); + num_bufs = Min(num_bufs, MaxAllocSize / CLUSTER_BLOCK_SIZE); /* Buffers should be I/O aligned. 
*/ cur_block = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + num_bufs * CLUSTER_BLOCK_SIZE + PG_IO_ALIGN_SIZE)); next_buf_in_block = 0; num_bufs_in_block = num_bufs; } /* Allocate next buffer in current memory block */ - this_buf = cur_block + next_buf_in_block * BLCKSZ; + this_buf = cur_block + next_buf_in_block * CLUSTER_BLOCK_SIZE; next_buf_in_block++; total_bufs_allocated++; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 41ab64100e..f382d0a2ea 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -60,7 +60,7 @@ * tablespaces when available. */ #define MAX_PHYSICAL_FILESIZE 0x40000000 -#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) +#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / CLUSTER_BLOCK_SIZE) /* * This data structure represents a buffered file that consists of one or @@ -681,7 +681,7 @@ BufFileWrite(BufFile *file, const void *ptr, size_t size) while (size > 0) { - if (file->pos >= BLCKSZ) + if (file->pos >= CLUSTER_BLOCK_SIZE) { /* Buffer full, dump it out */ if (file->dirty) @@ -695,7 +695,7 @@ BufFileWrite(BufFile *file, const void *ptr, size_t size) } } - nthistime = BLCKSZ - file->pos; + nthistime = CLUSTER_BLOCK_SIZE - file->pos; if (nthistime > size) nthistime = size; Assert(nthistime > 0); @@ -839,9 +839,9 @@ BufFileTell(BufFile *file, int *fileno, off_t *offset) /* * BufFileSeekBlock --- block-oriented seek * - * Performs absolute seek to the start of the n'th BLCKSZ-sized block of + * Performs absolute seek to the start of the n'th CLUSTER_BLOCK_SIZE-sized block of * the file. Note that users of this interface will fail if their files - * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work + * exceed CLUSTER_BLOCK_SIZE * LONG_MAX bytes, but that is quite a lot; we don't work * with tables bigger than that, either... * * Result is 0 if OK, EOF if not. 
Logical position is not moved if an @@ -852,7 +852,7 @@ BufFileSeekBlock(BufFile *file, long blknum) { return BufFileSeek(file, (int) (blknum / BUFFILE_SEG_SIZE), - (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, + (off_t) (blknum % BUFFILE_SEG_SIZE) * CLUSTER_BLOCK_SIZE, SEEK_SET); } @@ -867,7 +867,7 @@ BufFileTellBlock(BufFile *file) { long blknum; - blknum = (file->curOffset + file->pos) / BLCKSZ; + blknum = (file->curOffset + file->pos) / CLUSTER_BLOCK_SIZE; blknum += file->curFile * BUFFILE_SEG_SIZE; return blknum; } diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index e04bc3941a..45a83750b1 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -124,7 +124,7 @@ copy_file(const char *fromfile, const char *tofile) off_t flush_offset; /* Size of copy buffer (read and write requests) */ -#define COPY_BUF_SIZE (8 * BLCKSZ) +#define COPY_BUF_SIZE (8 * CLUSTER_BLOCK_SIZE) /* * Size of data flush requests. It seems beneficial on most platforms to diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 3c2a2fbef7..a7a6cd15a9 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -3945,10 +3945,10 @@ check_io_direct(char **newval, void **extra, GucSource source) result = false; } #endif -#if BLCKSZ < PG_IO_ALIGN_SIZE +#if DEFAULT_BLOCK_SIZE < PG_IO_ALIGN_SIZE if (result && (flags & IO_DIRECT_DATA)) { - GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small"); + GUC_check_errdetail("debug_io_direct is not supported for data because DEFAULT_BLOCK_SIZE is too small"); result = false; } #endif diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README index e7ff23b76f..7d03236b95 100644 --- a/src/backend/storage/freespace/README +++ b/src/backend/storage/freespace/README @@ -14,8 +14,8 @@ It is important to keep the map small so that it can be searched rapidly. 
Therefore, we don't attempt to record the exact free space on a page. We allocate one map byte to each page, allowing us to record free space at a granularity of 1/256th of a page. Another way to say it is that -the stored value is the free space divided by BLCKSZ/256 (rounding down). -We assume that the free space must always be less than BLCKSZ, since +the stored value is the free space divided by CLUSTER_BLOCK_SIZE/256 (rounding down). +We assume that the free space must always be less than CLUSTER_BLOCK_SIZE, since all pages have some overhead; so the maximum map value is 255. To assist in fast searching, the map isn't simply an array of per-page @@ -97,7 +97,7 @@ has the same value as the corresponding leaf node on its parent page. The root page is always stored at physical block 0. For example, assuming each FSM page can hold information about 4 pages (in -reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ), +reality, it holds (CLUSTER_BLOCK_SIZE - headers) / 2, or ~4000 with default CLUSTER_BLOCK_SIZE), we get a disk layout like this: 0 <-- page 0 at level 2 (root page) @@ -136,7 +136,7 @@ and so forth. To keep things simple, the tree is always constant height. To cover the maximum relation size of 2^32-1 blocks, three levels is enough with the default -BLCKSZ (4000^3 > 2^32). +CLUSTER_BLOCK_SIZE (4000^3 > 2^32). Addressing ---------- diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 2face615d0..5a28252dcd 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -41,8 +41,8 @@ * represents the range from 254 * FSM_CAT_STEP, inclusive, to * MaxFSMRequestSize, exclusive. 
* - * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming - * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the + * MaxFSMRequestSize depends on the architecture and CLUSTER_BLOCK_SIZE, but assuming + * default 8k CLUSTER_BLOCK_SIZE, and that MaxFSMRequestSize is 8164 bytes, the * categories look like this: * * @@ -62,14 +62,14 @@ * request of exactly MaxFSMRequestSize bytes. */ #define FSM_CATEGORIES 256 -#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES) +#define FSM_CAT_STEP (CLUSTER_BLOCK_SIZE / FSM_CATEGORIES) #define MaxFSMRequestSize MaxHeapTupleSize /* * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks, * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise, * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice, - * this means that 4096 bytes is the smallest BLCKSZ that we can get away + * this means that 4096 bytes is the smallest CLUSTER_BLOCK_SIZE that we can get away * with a 3-level tree, and 512 is the smallest we support. */ #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) @@ -87,8 +87,8 @@ typedef struct int logpageno; /* page number within the level */ } FSMAddress; -/* Address of the root page. */ -static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0}; +/* Address of the root page. Level is adjusted by FreeSpaceMapInit() */ +static FSMAddress FSM_ROOT_ADDRESS = {0,0}; /* functions to navigate the tree */ static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot); @@ -116,6 +116,17 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, /******** Public API ********/ +/* + * FreeSpaceMapInit - initialize the FSM system with the cluster block size + * + */ + +void +FreeSpaceMapInit(void) +{ + FSM_ROOT_ADDRESS.level = FSM_ROOT_LEVEL; +} + /* * GetPageWithFreeSpace - try to find a page in the given relation with * at least the specified amount of free space. 
@@ -217,7 +228,7 @@ XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk, page = BufferGetPage(buf); if (PageIsNew(page)) - PageInit(page, BLCKSZ, 0); + PageInit(page, CLUSTER_BLOCK_SIZE, 0); if (fsm_set_avail(page, slot, new_cat)) MarkBufferDirtyHint(buf, false); @@ -370,7 +381,7 @@ fsm_space_avail_to_cat(Size avail) { int cat; - Assert(avail < BLCKSZ); + Assert(avail < CLUSTER_BLOCK_SIZE); if (avail >= MaxFSMRequestSize) return 255; @@ -598,7 +609,7 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(BufferGetPage(buf))) - PageInit(BufferGetPage(buf), BLCKSZ, 0); + PageInit(BufferGetPage(buf), CLUSTER_BLOCK_SIZE, 0); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c index fff8f4fbfb..ba10e22bc0 100644 --- a/src/backend/storage/freespace/indexfsm.c +++ b/src/backend/storage/freespace/indexfsm.c @@ -16,7 +16,7 @@ * This is similar to the FSM used for heap, in freespace.c, but instead * of tracking the amount of free space on pages, we only track whether * pages are completely free or in-use. We use the same FSM implementation - * as for heaps, using BLCKSZ - 1 to denote used pages, and 0 for unused. + * as for heaps, using CLUSTER_BLOCK_SIZE - 1 to denote used pages, and 0 for unused. 
* *------------------------------------------------------------------------- */ @@ -37,7 +37,7 @@ BlockNumber GetFreeIndexPage(Relation rel) { - BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2); + BlockNumber blkno = GetPageWithFreeSpace(rel, CLUSTER_BLOCK_SIZE / 2); if (blkno != InvalidBlockNumber) RecordUsedIndexPage(rel, blkno); @@ -51,7 +51,7 @@ GetFreeIndexPage(Relation rel) void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) { - RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); + RecordPageWithFreeSpace(rel, freeBlock, CLUSTER_BLOCK_SIZE - 1); } diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c index 84e543e731..a9bc230e4a 100644 --- a/src/backend/storage/large_object/inv_api.c +++ b/src/backend/storage/large_object/inv_api.c @@ -598,7 +598,7 @@ inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) { bytea hdr; /* this is to make the union big enough for a LO data chunk: */ - char data[LOBLKSIZE + VARHDRSZ]; + char data[LOBLKSIZE_LIMIT + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; } workbuf; @@ -789,7 +789,7 @@ inv_truncate(LargeObjectDesc *obj_desc, int64 len) { bytea hdr; /* this is to make the union big enough for a LO data chunk: */ - char data[LOBLKSIZE + VARHDRSZ]; + char data[LOBLKSIZE_LIMIT + VARHDRSZ]; /* ensure union is aligned well enough: */ int32 align_it; } workbuf; diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 1af41213b4..1eab69fc54 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -321,7 +321,7 @@ static SlruCtlData SerialSlruCtlData; #define SerialSlruCtl (&SerialSlruCtlData) -#define SERIAL_PAGESIZE BLCKSZ +#define SERIAL_PAGESIZE CLUSTER_BLOCK_SIZE #define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo) #define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE) diff --git a/src/backend/storage/page/bufpage.c 
b/src/backend/storage/page/bufpage.c index 9a302ddc30..a96fa911a5 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -45,7 +45,7 @@ PageInit(Page page, Size pageSize, Size specialSize) specialSize = MAXALIGN(specialSize); - Assert(pageSize == BLCKSZ); + Assert(pageSize == CLUSTER_BLOCK_SIZE); Assert(pageSize > specialSize + SizeOfPageHeaderData); /* Make sure all fields of page are zero, as well as unused space */ @@ -102,7 +102,7 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) { if (DataChecksumsEnabled()) { - checksum = pg_checksum_page((char *) page, blkno); + checksum = pg_checksum_page((char *) page, blkno, CLUSTER_BLOCK_SIZE); if (checksum != p->pd_checksum) checksum_failure = true; @@ -117,7 +117,7 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && p->pd_lower <= p->pd_upper && p->pd_upper <= p->pd_special && - p->pd_special <= BLCKSZ && + p->pd_special <= CLUSTER_BLOCK_SIZE && p->pd_special == MAXALIGN(p->pd_special)) header_sane = true; @@ -128,7 +128,7 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) /* Check all-zeroes case */ all_zeroes = true; pagebytes = (size_t *) page; - for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++) + for (i = 0; i < (CLUSTER_BLOCK_SIZE / sizeof(size_t)); i++) { if (pagebytes[i] != 0) { @@ -211,7 +211,7 @@ PageAddItemExtended(Page page, if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ) + phdr->pd_special > CLUSTER_BLOCK_SIZE) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", @@ -702,7 +702,7 @@ PageRepairFragmentation(Page page) Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; Offset last_offset; - itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; + itemIdCompactData 
itemidbase[MaxHeapTuplesPerPageLimit]; itemIdCompact itemidptr; ItemId lp; int nline, @@ -723,7 +723,7 @@ PageRepairFragmentation(Page page) if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || - pd_special > BLCKSZ || + pd_special > CLUSTER_BLOCK_SIZE || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1066,7 +1066,7 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum) if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special > CLUSTER_BLOCK_SIZE || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1165,8 +1165,8 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) Offset pd_upper = phdr->pd_upper; Offset pd_special = phdr->pd_special; Offset last_offset; - itemIdCompactData itemidbase[MaxIndexTuplesPerPage]; - ItemIdData newitemids[MaxIndexTuplesPerPage]; + itemIdCompactData itemidbase[MaxIndexTuplesPerPageLimit]; + ItemIdData newitemids[MaxIndexTuplesPerPageLimit]; itemIdCompact itemidptr; ItemId lp; int nline, @@ -1201,7 +1201,7 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || - pd_special > BLCKSZ || + pd_special > CLUSTER_BLOCK_SIZE || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1307,7 +1307,7 @@ PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special > CLUSTER_BLOCK_SIZE || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1419,7 +1419,7 @@ PageIndexTupleOverwrite(Page page, OffsetNumber offnum, if (phdr->pd_lower < SizeOfPageHeaderData || 
phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || - phdr->pd_special > BLCKSZ || + phdr->pd_special > CLUSTER_BLOCK_SIZE || phdr->pd_special != MAXALIGN(phdr->pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -1523,12 +1523,12 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) */ if (pageCopy == NULL) pageCopy = MemoryContextAllocAligned(TopMemoryContext, - BLCKSZ, + CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, 0); - memcpy(pageCopy, (char *) page, BLCKSZ); - ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + memcpy(pageCopy, (char *) page, CLUSTER_BLOCK_SIZE); + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno, CLUSTER_BLOCK_SIZE); return pageCopy; } @@ -1545,5 +1545,5 @@ PageSetChecksumInplace(Page page, BlockNumber blkno) if (PageIsNew(page) || !DataChecksumsEnabled()) return; - ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno, CLUSTER_BLOCK_SIZE); } diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README index cf3aa5632b..9a32663416 100644 --- a/src/backend/storage/smgr/README +++ b/src/backend/storage/smgr/README @@ -50,3 +50,55 @@ to relfilelocator and block number, to identify which relation fork you want to access. Since most code wants to access the main fork, a shortcut version of ReadBuffer that accesses MAIN_FORKNUM is provided in the buffer manager for convenience. + + +Adjustments for Variable Block Sizes +==================================== + +Prior to 17, the disk block size (aka BLCKSZ) was set at compile-time. While +each database cluster still has a fixed block size, this is now selected at +`initdb` time, meaning that a single PostgreSQL binary installation can support +clusters of varying sizes. + +As this is now a per-cluster setting, the block size is set in the pg_control +file. 
There are multiple computed settings which rely on the cluster block size, +so these also need to be initialized at cluster start. This is managed via +BlockSizeInit() in src/common/blocksize.c. This routine computes a table for +all values which would have been computed at compile time. + +The expressions used to calculate the values in this table have been moved from +their original locations and parameterized based on blocksize; these are named +CalcXXXX(blocksize), and are stored in src/include/common/blocksize.h. + +In order to minimize code changes, we have kept the names of former constants +(such as MaxHeapTuplesPerPage) and replaced their definitions with an +appropriate lookup in this table. Since these lookups are now handled at +runtime, it is important to initialize these as early as possible so any code +usage of these constants uses the computed value after it has been initialized +instead of 0. This applies to any piece of the code which needs to know about +the block size or uses any value which was based on it. + +To ensure that any code that depended on the old behavior is properly adjusted, +we have removed BLCKSZ as a constant, which will ensure that external code will +fail to compile. When writing new code which cares about the block size, you +should use one of the following values: + +- CLUSTER_BLOCK_SIZE - evaluates to the cluster's block size as initialized by + BlockSizeInit(). + +- DEFAULT_BLOCK_SIZE - the compiled-in default block size; 8k + +- MIN_BLOCK_SIZE - the smallest supported block size; 1k + +- MAX_BLOCK_SIZE - the largest supported block size; 32k + +In general, you should use CLUSTER_BLOCK_SIZE unless you are allocating space on +the stack, and even then you should be careful to allocate based on +MAX_BLOCK_SIZE, but still limit accesses to these structs based on the /actual/ +runtime cluster block size. The core code has been modified to follow these +standards. 
+ +Occasionally, we had been using BLCKSZ as a "big enough buffer" rather than +caring about any specific size correlated with the cluster. In those cases, you +should just use DEFAULT_BLOCK_SIZE. + diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 30dbc02f82..24454420c0 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -47,13 +47,13 @@ * easier to support relations that are larger than the operating * system's file size limit (often 2GBytes). In order to do that, * we break relations up into "segment" files that are each shorter than - * the OS file size limit. The segment size is set by the RELSEG_SIZE + * the OS file size limit. The segment size is set by the CLUSTER_RELSEG_SIZE * configuration constant in pg_config.h. * * On disk, a relation must consist of consecutively numbered segment * files in the pattern - * -- Zero or more full segments of exactly RELSEG_SIZE blocks each - * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks + * -- Zero or more full segments of exactly CLUSTER_RELSEG_SIZE blocks each + * -- Exactly one partial segment of size 0 <= size < CLUSTER_RELSEG_SIZE blocks * -- Optionally, any number of inactive segments of size 0 blocks. * The full and partial segments are collectively the "active" segments. * Inactive segments are those that once contained data but are currently @@ -110,7 +110,7 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ #define EXTENSION_CREATE_RECOVERY (1 << 3) /* * Allow opening segments which are preceded by segments smaller than - * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks + * CLUSTER_RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks * mdnblocks() and related functionality henceforth - which currently is ok, * because this is only required in the checkpointer which never uses * mdnblocks(). 
@@ -464,7 +464,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MdfdVec *v; /* If this build supports direct I/O, the buffer must be I/O aligned. */ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= CLUSTER_BLOCK_SIZE) Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); /* This assert is too expensive to have on normally ... */ @@ -487,11 +487,11 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) CLUSTER_BLOCK_SIZE * (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t) CLUSTER_BLOCK_SIZE * CLUSTER_RELSEG_SIZE); - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, CLUSTER_BLOCK_SIZE, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != CLUSTER_BLOCK_SIZE) { if (nbytes < 0) ereport(ERROR, @@ -504,14 +504,14 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, (errcode(ERRCODE_DISK_FULL), errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ, blocknum), + nbytes, CLUSTER_BLOCK_SIZE, blocknum), errhint("Check free disk space."))); } if (!skipFsync && !SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) CLUSTER_RELSEG_SIZE)); } /* @@ -549,19 +549,19 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, while (remblocks > 0) { - BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); - off_t seekpos = (off_t) BLCKSZ * segstartblock; + BlockNumber segstartblock = curblocknum % ((BlockNumber) 
CLUSTER_RELSEG_SIZE); + off_t seekpos = (off_t) CLUSTER_BLOCK_SIZE * segstartblock; int numblocks; - if (segstartblock + remblocks > RELSEG_SIZE) - numblocks = RELSEG_SIZE - segstartblock; + if (segstartblock + remblocks > CLUSTER_RELSEG_SIZE) + numblocks = CLUSTER_RELSEG_SIZE - segstartblock; else numblocks = remblocks; v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); - Assert(segstartblock < RELSEG_SIZE); - Assert(segstartblock + numblocks <= RELSEG_SIZE); + Assert(segstartblock < CLUSTER_RELSEG_SIZE); + Assert(segstartblock + numblocks <= CLUSTER_RELSEG_SIZE); /* * If available and useful, use posix_fallocate() (via @@ -579,7 +579,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, int ret; ret = FileFallocate(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (off_t) CLUSTER_BLOCK_SIZE * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { @@ -602,7 +602,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, * whole length of the extension. 
*/ ret = FileZero(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, + seekpos, (off_t) CLUSTER_BLOCK_SIZE * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) ereport(ERROR, @@ -615,7 +615,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, if (!skipFsync && !SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) CLUSTER_RELSEG_SIZE)); remblocks -= numblocks; curblocknum += numblocks; @@ -667,7 +667,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) CLUSTER_RELSEG_SIZE)); return mdfd; } @@ -723,11 +723,11 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) if (v == NULL) return false; - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) CLUSTER_BLOCK_SIZE * (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t) CLUSTER_BLOCK_SIZE * CLUSTER_RELSEG_SIZE); - (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); + (void) FilePrefetch(v->mdfd_vfd, seekpos, CLUSTER_BLOCK_SIZE, WAIT_EVENT_DATA_FILE_PREFETCH); #endif /* USE_PREFETCH */ return true; @@ -745,7 +745,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MdfdVec *v; /* If this build supports direct I/O, the buffer must be I/O aligned. 
*/ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= CLUSTER_BLOCK_SIZE) Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, @@ -757,11 +757,11 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) CLUSTER_BLOCK_SIZE * (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t) CLUSTER_BLOCK_SIZE * CLUSTER_RELSEG_SIZE); - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + nbytes = FileRead(v->mdfd_vfd, buffer, CLUSTER_BLOCK_SIZE, seekpos, WAIT_EVENT_DATA_FILE_READ); TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rlocator.locator.spcOid, @@ -769,9 +769,9 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rlocator.locator.relNumber, reln->smgr_rlocator.backend, nbytes, - BLCKSZ); + CLUSTER_BLOCK_SIZE); - if (nbytes != BLCKSZ) + if (nbytes != CLUSTER_BLOCK_SIZE) { if (nbytes < 0) ereport(ERROR, @@ -788,13 +788,13 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * update a block that was later truncated away. */ if (zero_damaged_pages || InRecovery) - MemSet(buffer, 0, BLCKSZ); + MemSet(buffer, 0, CLUSTER_BLOCK_SIZE); else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", blocknum, FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ))); + nbytes, CLUSTER_BLOCK_SIZE))); } } @@ -814,7 +814,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MdfdVec *v; /* If this build supports direct I/O, the buffer must be I/O aligned. 
*/ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= CLUSTER_BLOCK_SIZE) Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); /* This assert is too expensive to have on normally ... */ @@ -831,11 +831,11 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) CLUSTER_BLOCK_SIZE * (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t) CLUSTER_BLOCK_SIZE * CLUSTER_RELSEG_SIZE); - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + nbytes = FileWrite(v->mdfd_vfd, buffer, CLUSTER_BLOCK_SIZE, seekpos, WAIT_EVENT_DATA_FILE_WRITE); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rlocator.locator.spcOid, @@ -843,9 +843,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_rlocator.locator.relNumber, reln->smgr_rlocator.backend, nbytes, - BLCKSZ); + CLUSTER_BLOCK_SIZE); - if (nbytes != BLCKSZ) + if (nbytes != CLUSTER_BLOCK_SIZE) { if (nbytes < 0) ereport(ERROR, @@ -858,7 +858,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", blocknum, FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ), + nbytes, CLUSTER_BLOCK_SIZE), errhint("Check free disk space."))); } @@ -904,19 +904,19 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, return; /* compute offset inside the current segment */ - segnum_start = blocknum / RELSEG_SIZE; + segnum_start = blocknum / CLUSTER_RELSEG_SIZE; /* compute number of desired writes within the current segment */ - segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; + segnum_end = (blocknum + nblocks - 1) / CLUSTER_RELSEG_SIZE; if 
(segnum_start != segnum_end) - nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + nflush = CLUSTER_RELSEG_SIZE - (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); Assert(nflush >= 1); Assert(nflush <= nblocks); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t) CLUSTER_BLOCK_SIZE * (blocknum % ((BlockNumber) CLUSTER_RELSEG_SIZE)); - FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + FileWriteback(v->mdfd_vfd, seekpos, (off_t) CLUSTER_BLOCK_SIZE * nflush, WAIT_EVENT_DATA_FILE_FLUSH); nblocks -= nflush; blocknum += nflush; @@ -945,7 +945,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) /* * Start from the last open segments, to avoid redundant seeks. We have - * previously verified that these segments are exactly RELSEG_SIZE long, + * previously verified that these segments are exactly CLUSTER_RELSEG_SIZE long, * and it's useless to recheck that each time. * * NOTE: this assumption could only be wrong if another backend has @@ -962,13 +962,13 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) for (;;) { nblocks = _mdnblocks(reln, forknum, v); - if (nblocks > ((BlockNumber) RELSEG_SIZE)) + if (nblocks > ((BlockNumber) CLUSTER_RELSEG_SIZE)) elog(FATAL, "segment too big"); - if (nblocks < ((BlockNumber) RELSEG_SIZE)) - return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks; + if (nblocks < ((BlockNumber) CLUSTER_RELSEG_SIZE)) + return (segno * ((BlockNumber) CLUSTER_RELSEG_SIZE)) + nblocks; /* - * If segment is exactly RELSEG_SIZE, advance to next one. + * If segment is exactly CLUSTER_RELSEG_SIZE, advance to next one. 
*/ segno++; @@ -981,7 +981,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) */ v = _mdfd_openseg(reln, forknum, segno, 0); if (v == NULL) - return segno * ((BlockNumber) RELSEG_SIZE); + return segno * ((BlockNumber) CLUSTER_RELSEG_SIZE); } } @@ -1022,7 +1022,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { MdfdVec *v; - priorblocks = (curopensegs - 1) * RELSEG_SIZE; + priorblocks = (curopensegs - 1) * CLUSTER_RELSEG_SIZE; v = &reln->md_seg_fds[forknum][curopensegs - 1]; @@ -1047,18 +1047,18 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) FileClose(v->mdfd_vfd); _fdvec_resize(reln, forknum, curopensegs - 1); } - else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) + else if (priorblocks + ((BlockNumber) CLUSTER_RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to * the right length. NOTE: if nblocks is exactly a multiple K of - * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but + * CLUSTER_RELSEG_SIZE, we will truncate the K+1st segment to 0 length but * keep it. This adheres to the invariant given in the header * comments. 
*/ BlockNumber lastsegblocks = nblocks - priorblocks; - if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) + if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * CLUSTER_BLOCK_SIZE, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", @@ -1369,7 +1369,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, v->mdfd_vfd = fd; v->mdfd_segno = segno; - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) CLUSTER_RELSEG_SIZE)); /* all done */ return v; @@ -1396,7 +1396,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL | EXTENSION_DONT_OPEN)); - targetseg = blkno / ((BlockNumber) RELSEG_SIZE); + targetseg = blkno / ((BlockNumber) CLUSTER_RELSEG_SIZE); /* if an existing and opened segment, we're done */ if (targetseg < reln->md_num_open_segs[forknum]) @@ -1433,7 +1433,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, Assert(nextsegno == v->mdfd_segno + 1); - if (nblocks > ((BlockNumber) RELSEG_SIZE)) + if (nblocks > ((BlockNumber) CLUSTER_RELSEG_SIZE)) elog(FATAL, "segment too big"); if ((behavior & EXTENSION_CREATE) || @@ -1448,30 +1448,30 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * ahead and create the segments so we can finish out the replay. * * We have to maintain the invariant that segments before the last - * active segment are of size RELSEG_SIZE; therefore, if + * active segment are of size CLUSTER_RELSEG_SIZE; therefore, if * extending, pad them out with zeroes if needed. (This only * matters if in recovery, or if the caller is extending the * relation discontiguously, but that can happen in hash indexes.) 
*/ - if (nblocks < ((BlockNumber) RELSEG_SIZE)) + if (nblocks < ((BlockNumber) CLUSTER_RELSEG_SIZE)) { - char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, + char *zerobuf = palloc_aligned(CLUSTER_BLOCK_SIZE, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); mdextend(reln, forknum, - nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + nextsegno * ((BlockNumber) CLUSTER_RELSEG_SIZE) - 1, zerobuf, skipFsync); pfree(zerobuf); } flags = O_CREAT; } else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && - nblocks < ((BlockNumber) RELSEG_SIZE)) + nblocks < ((BlockNumber) CLUSTER_RELSEG_SIZE)) { /* * When not extending (or explicitly including truncated * segments), only open the next segment if the current one is - * exactly RELSEG_SIZE. If not (this branch), either return NULL + * exactly CLUSTER_RELSEG_SIZE. If not (this branch), either return NULL * or fail. */ if (behavior & EXTENSION_RETURN_NULL) @@ -1526,7 +1526,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) errmsg("could not seek to end of file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); /* note that this calculation will ignore any partial block at EOF */ - return (BlockNumber) (len / BLCKSZ); + return (BlockNumber) (len / CLUSTER_BLOCK_SIZE); } /* diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 2a4c8ef87f..27254a8b9d 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -1404,12 +1404,12 @@ pg_stat_get_io(PG_FUNCTION_ARGS) values[IO_COL_RESET_TIME] = TimestampTzGetDatum(reset_time); /* - * Hard-code this to the value of BLCKSZ for now. Future + * Hard-code this to the value of CLUSTER_BLOCK_SIZE for now. Future * values could include XLOG_BLCKSZ, once WAL IO is tracked, * and constant multipliers, once non-block-oriented IO (e.g. * temporary file IO) is tracked. 
*/ - values[IO_COL_CONVERSION] = Int64GetDatum(BLCKSZ); + values[IO_COL_CONVERSION] = Int64GetDatum(CLUSTER_BLOCK_SIZE); for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++) { diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c4fcd0076e..9a21c946df 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7760,7 +7760,7 @@ gincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * around 3 bytes per item is fairly typical. */ dataPagesFetchedBySel = ceil(*indexSelectivity * - (numTuples / (BLCKSZ / 3))); + (numTuples / (CLUSTER_BLOCK_SIZE / 3))); if (dataPagesFetchedBySel > dataPagesFetched) dataPagesFetched = dataPagesFetchedBySel; diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index a604432126..3ef8bc7e66 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1519,8 +1519,8 @@ AddToDataDirLockFile(int target_line, const char *str) int lineno; char *srcptr; char *destptr; - char srcbuffer[BLCKSZ]; - char destbuffer[BLCKSZ]; + char srcbuffer[DEFAULT_BLOCK_SIZE]; + char destbuffer[DEFAULT_BLOCK_SIZE]; fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0); if (fd < 0) @@ -1644,7 +1644,7 @@ RecheckDataDirLockFile(void) int fd; int len; long file_pid; - char buffer[BLCKSZ]; + char buffer[DEFAULT_BLOCK_SIZE]; fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0); if (fd < 0) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5308896c87..f2d92b9ecd 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -30,6 +30,7 @@ #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/objectaccess.h" #include "catalog/pg_authid.h" #include "catalog/pg_parameter_acl.h" @@ -105,8 +106,8 @@ typedef struct } unit_conversion; /* Ensure that the constants in the tables don't overflow or underflow */ -#if BLCKSZ < 1024 || BLCKSZ > 
(1024*1024) -#error BLCKSZ must be between 1KB and 1MB +#if DEFAULT_BLOCK_SIZE < 1024 || DEFAULT_BLOCK_SIZE > (1024*1024) +#error DEFAULT_BLOCK_SIZE must be between 1KB and 1MB #endif #if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024) #error XLOG_BLCKSZ must be between 1KB and 1MB @@ -134,11 +135,11 @@ static const unit_conversion memory_unit_conversion_table[] = {"kB", GUC_UNIT_MB, 1.0 / 1024.0}, {"B", GUC_UNIT_MB, 1.0 / (1024.0 * 1024.0)}, - {"TB", GUC_UNIT_BLOCKS, (1024.0 * 1024.0 * 1024.0) / (BLCKSZ / 1024)}, - {"GB", GUC_UNIT_BLOCKS, (1024.0 * 1024.0) / (BLCKSZ / 1024)}, - {"MB", GUC_UNIT_BLOCKS, 1024.0 / (BLCKSZ / 1024)}, - {"kB", GUC_UNIT_BLOCKS, 1.0 / (BLCKSZ / 1024)}, - {"B", GUC_UNIT_BLOCKS, 1.0 / BLCKSZ}, + {"TB", GUC_UNIT_BLOCKS, (1024.0 * 1024.0 * 1024.0) / (DEFAULT_BLOCK_SIZE / 1024)}, + {"GB", GUC_UNIT_BLOCKS, (1024.0 * 1024.0) / (DEFAULT_BLOCK_SIZE / 1024)}, + {"MB", GUC_UNIT_BLOCKS, 1024.0 / (DEFAULT_BLOCK_SIZE / 1024)}, + {"kB", GUC_UNIT_BLOCKS, 1.0 / (DEFAULT_BLOCK_SIZE / 1024)}, + {"B", GUC_UNIT_BLOCKS, 1.0 / DEFAULT_BLOCK_SIZE}, {"TB", GUC_UNIT_XBLOCKS, (1024.0 * 1024.0 * 1024.0) / (XLOG_BLCKSZ / 1024)}, {"GB", GUC_UNIT_XBLOCKS, (1024.0 * 1024.0) / (XLOG_BLCKSZ / 1024)}, @@ -1503,6 +1504,16 @@ InitializeGUCOptions(void) */ pg_timezone_initialize(); + /* Load our block size -- not valid in Bootstrap or init mode, since control file hasn't been written */ + if (!(IsInitProcessingMode() || IsBootstrapProcessingMode())) + { + BlockSizeInit(ClusterBlockSize()); + /* + * Ensure GUCs with dynamic limits have been properly initialized + */ + update_dynamic_gucs(); + } + /* * Create GUCMemoryContext and build hash table of all GUC variables. 
*/ @@ -2782,7 +2793,7 @@ get_config_unit_name(int flags) /* initialize if first time through */ if (bbuf[0] == '\0') - snprintf(bbuf, sizeof(bbuf), "%dkB", BLCKSZ / 1024); + snprintf(bbuf, sizeof(bbuf), "%dkB", CLUSTER_BLOCK_SIZE / 1024); return bbuf; } case GUC_UNIT_XBLOCKS: diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 71e27f8eb0..0cf735f02e 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -41,6 +41,7 @@ #include "commands/trigger.h" #include "commands/user.h" #include "commands/vacuum.h" +#include "common/blocksize.h" #include "common/scram-common.h" #include "jit/jit.h" #include "libpq/auth.h" @@ -497,8 +498,11 @@ bool log_btree_build_stats = false; char *event_source; bool row_security; + bool check_function_bodies = true; +int block_size; + /* * This GUC exists solely for backward compatibility, check its definition for * details. @@ -580,7 +584,6 @@ static char *session_authorization_string; static int max_function_args; static int max_index_keys; static int max_identifier_length; -static int block_size; static int segment_size; static int shared_memory_size_mb; static int shared_memory_size_in_huge_pages; @@ -3116,7 +3119,7 @@ struct config_int ConfigureNamesInt[] = GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE }, &block_size, - BLCKSZ, BLCKSZ, BLCKSZ, + DEFAULT_BLOCK_SIZE, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE, NULL, NULL, NULL }, @@ -3366,7 +3369,8 @@ struct config_int ConfigureNamesInt[] = GUC_UNIT_BLOCKS | GUC_EXPLAIN, }, &min_parallel_table_scan_size, - (8 * 1024 * 1024) / BLCKSZ, 0, INT_MAX / 3, + /* This is set dynamically based on CLUSTER_BLOCK_SIZE, but works for a default */ + (8 * 1024 * 1024) / DEFAULT_BLOCK_SIZE, 0, INT_MAX / 3, NULL, NULL, NULL }, @@ -3377,7 +3381,8 @@ struct config_int ConfigureNamesInt[] = GUC_UNIT_BLOCKS | GUC_EXPLAIN, }, &min_parallel_index_scan_size, - (512 * 1024) / BLCKSZ, 0, INT_MAX / 3, + /* This is set dynamically based on 
CLUSTER_BLOCK_SIZE, but works for a default */ + (512 * 1024) / DEFAULT_BLOCK_SIZE, 0, INT_MAX / 3, NULL, NULL, NULL }, @@ -4973,3 +4978,29 @@ struct config_enum ConfigureNamesEnum[] = {NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL } }; + + + +/* + * Update the specific GUCs which have dynamic limits. Due to having variable + * block sizes, what were originally constants now depend on the runtime block + * size, so handle the adjustments to those boot values here. + */ +void +update_dynamic_gucs(void) +{ + int i; + + for (i = 0; ConfigureNamesInt[i].gen.name; i++) + { + if (strcmp("min_parallel_table_scan_size", ConfigureNamesInt[i].gen.name) == 0) + { + ConfigureNamesInt[i].boot_val = (8 * 1024 * 1024) / CLUSTER_BLOCK_SIZE; + } + else if (strcmp("min_parallel_index_scan_size", ConfigureNamesInt[i].gen.name) == 0) + { + ConfigureNamesInt[i].boot_val = (512 * 1024) / CLUSTER_BLOCK_SIZE; + } + } + +} diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 52b8898d5e..a8025d2f7e 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -27,7 +27,7 @@ * larger size than the underlying OS may support. * * For simplicity, we allocate and release space in the underlying file - * in BLCKSZ-size blocks. Space allocation boils down to keeping track + * in CLUSTER_BLOCK_SIZE-size blocks. Space allocation boils down to keeping track * of which blocks in the underlying file belong to which logical tape, * plus any blocks that are free (recycled and not yet reused). * The blocks in each logical tape form a chain, with a prev- and next- @@ -86,7 +86,7 @@ #include "utils/memutils.h" /* - * A TapeBlockTrailer is stored at the end of each BLCKSZ block. + * A TapeBlockTrailer is stored at the end of each CLUSTER_BLOCK_SIZE block. * * The first block of a tape has prev == -1. 
The last block of a tape * stores the number of valid bytes on the block, inverted, in 'next' @@ -100,7 +100,7 @@ typedef struct TapeBlockTrailer * bytes on last block (if < 0) */ } TapeBlockTrailer; -#define TapeBlockPayloadSize (BLCKSZ - sizeof(TapeBlockTrailer)) +#define TapeBlockPayloadSize (CLUSTER_BLOCK_SIZE - sizeof(TapeBlockTrailer)) #define TapeBlockGetTrailer(buf) \ ((TapeBlockTrailer *) ((char *) buf + TapeBlockPayloadSize)) @@ -192,7 +192,7 @@ struct LogicalTapeSet /* * File size tracking. nBlocksWritten is the size of the underlying file, - * in BLCKSZ blocks. nBlocksAllocated is the number of blocks allocated + * in CLUSTER_BLOCK_SIZE blocks. nBlocksAllocated is the number of blocks allocated * by ltsReleaseBlock(), and it is always greater than or equal to * nBlocksWritten. Blocks between nBlocksAllocated and nBlocksWritten are * blocks that have been allocated for a tape, but have not been written @@ -265,7 +265,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, const void *buffer) (errcode_for_file_access(), errmsg("could not seek to block %ld of temporary file", blocknum))); - BufFileWrite(lts->pfile, buffer, BLCKSZ); + BufFileWrite(lts->pfile, buffer, CLUSTER_BLOCK_SIZE); /* Update nBlocksWritten, if we extended the file */ if (blocknum == lts->nBlocksWritten) @@ -286,7 +286,7 @@ ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer) (errcode_for_file_access(), errmsg("could not seek to block %ld of temporary file", blocknum))); - BufFileReadExact(lts->pfile, buffer, BLCKSZ); + BufFileReadExact(lts->pfile, buffer, CLUSTER_BLOCK_SIZE); } /* @@ -328,7 +328,7 @@ ltsReadFillBuffer(LogicalTape *lt) lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next; /* Advance to next block, if we have buffer space left */ - } while (lt->buffer_size - lt->nbytes > BLCKSZ); + } while (lt->buffer_size - lt->nbytes > CLUSTER_BLOCK_SIZE); return (lt->nbytes > 0); } @@ -640,7 +640,7 @@ LogicalTapeImport(LogicalTapeSet *lts, int worker, TapeShare *shared) 
} /* Don't allocate more for read buffer than could possibly help */ lt->max_size = Min(MaxAllocSize, filesize); - tapeblocks = filesize / BLCKSZ; + tapeblocks = filesize / CLUSTER_BLOCK_SIZE; /* * Update # of allocated blocks and # blocks written to reflect the @@ -769,8 +769,8 @@ LogicalTapeWrite(LogicalTape *lt, const void *ptr, size_t size) /* Allocate data buffer and first block on first write */ if (lt->buffer == NULL) { - lt->buffer = (char *) palloc(BLCKSZ); - lt->buffer_size = BLCKSZ; + lt->buffer = (char *) palloc(CLUSTER_BLOCK_SIZE); + lt->buffer_size = CLUSTER_BLOCK_SIZE; } if (lt->curBlockNumber == -1) { @@ -783,7 +783,7 @@ LogicalTapeWrite(LogicalTape *lt, const void *ptr, size_t size) TapeBlockGetTrailer(lt->buffer)->prev = -1L; } - Assert(lt->buffer_size == BLCKSZ); + Assert(lt->buffer_size == CLUSTER_BLOCK_SIZE); while (size > 0) { if (lt->pos >= (int) TapeBlockPayloadSize) @@ -837,9 +837,9 @@ LogicalTapeWrite(LogicalTape *lt, const void *ptr, size_t size) * * 'buffer_size' specifies how much memory to use for the read buffer. * Regardless of the argument, the actual amount of memory used is between - * BLCKSZ and MaxAllocSize, and is a multiple of BLCKSZ. The given value is + * CLUSTER_BLOCK_SIZE and MaxAllocSize, and is a multiple of CLUSTER_BLOCK_SIZE. The given value is * rounded down and truncated to fit those constraints, if necessary. If the - * tape is frozen, the 'buffer_size' argument is ignored, and a small BLCKSZ + * tape is frozen, the 'buffer_size' argument is ignored, and a small CLUSTER_BLOCK_SIZE * byte buffer is used. */ void @@ -851,19 +851,19 @@ LogicalTapeRewindForRead(LogicalTape *lt, size_t buffer_size) * Round and cap buffer_size if needed. 
*/ if (lt->frozen) - buffer_size = BLCKSZ; + buffer_size = CLUSTER_BLOCK_SIZE; else { /* need at least one block */ - if (buffer_size < BLCKSZ) - buffer_size = BLCKSZ; + if (buffer_size < CLUSTER_BLOCK_SIZE) + buffer_size = CLUSTER_BLOCK_SIZE; /* palloc() larger than max_size is unlikely to be helpful */ if (buffer_size > lt->max_size) buffer_size = lt->max_size; - /* round down to BLCKSZ boundary */ - buffer_size -= buffer_size % BLCKSZ; + /* round down to CLUSTER_BLOCK_SIZE boundary */ + buffer_size -= buffer_size % CLUSTER_BLOCK_SIZE; } if (lt->writing) @@ -1015,12 +1015,12 @@ LogicalTapeFreeze(LogicalTape *lt, TapeShare *share) * we're reading from multiple tapes. But at the end of a sort, when a * tape is frozen, we only read from a single tape anyway. */ - if (!lt->buffer || lt->buffer_size != BLCKSZ) + if (!lt->buffer || lt->buffer_size != CLUSTER_BLOCK_SIZE) { if (lt->buffer) pfree(lt->buffer); - lt->buffer = palloc(BLCKSZ); - lt->buffer_size = BLCKSZ; + lt->buffer = palloc(CLUSTER_BLOCK_SIZE); + lt->buffer_size = CLUSTER_BLOCK_SIZE; } /* Read the first block, or reset if tape is empty */ @@ -1064,7 +1064,7 @@ LogicalTapeBackspace(LogicalTape *lt, size_t size) size_t seekpos = 0; Assert(lt->frozen); - Assert(lt->buffer_size == BLCKSZ); + Assert(lt->buffer_size == CLUSTER_BLOCK_SIZE); if (lt->buffer == NULL) ltsInitReadBuffer(lt); @@ -1134,7 +1134,7 @@ LogicalTapeSeek(LogicalTape *lt, long blocknum, int offset) { Assert(lt->frozen); Assert(offset >= 0 && offset <= TapeBlockPayloadSize); - Assert(lt->buffer_size == BLCKSZ); + Assert(lt->buffer_size == CLUSTER_BLOCK_SIZE); if (lt->buffer == NULL) ltsInitReadBuffer(lt); @@ -1167,7 +1167,7 @@ LogicalTapeTell(LogicalTape *lt, long *blocknum, int *offset) Assert(lt->offsetBlockNumber == 0L); /* With a larger buffer, 'pos' wouldn't be the same as offset within page */ - Assert(lt->buffer_size == BLCKSZ); + Assert(lt->buffer_size == CLUSTER_BLOCK_SIZE); *blocknum = lt->curBlockNumber; *offset = lt->pos; diff --git 
a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index 236be65f22..4f0cebf5be 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -37,7 +37,7 @@ */ #define STS_CHUNK_PAGES 4 #define STS_CHUNK_HEADER_SIZE offsetof(SharedTuplestoreChunk, data) -#define STS_CHUNK_DATA_SIZE (STS_CHUNK_PAGES * BLCKSZ - STS_CHUNK_HEADER_SIZE) +#define STS_CHUNK_DATA_SIZE (STS_CHUNK_PAGES * CLUSTER_BLOCK_SIZE - STS_CHUNK_HEADER_SIZE) /* Chunk written to disk. */ typedef struct SharedTuplestoreChunk @@ -198,7 +198,7 @@ sts_flush_chunk(SharedTuplestoreAccessor *accessor) { size_t size; - size = STS_CHUNK_PAGES * BLCKSZ; + size = STS_CHUNK_PAGES * CLUSTER_BLOCK_SIZE; BufFileWrite(accessor->write_file, accessor->write_chunk, size); memset(accessor->write_chunk, 0, size); accessor->write_pointer = &accessor->write_chunk->data[0]; @@ -332,11 +332,11 @@ sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data, /* First time through. Allocate chunk. 
*/ accessor->write_chunk = (SharedTuplestoreChunk *) MemoryContextAllocZero(accessor->context, - STS_CHUNK_PAGES * BLCKSZ); + STS_CHUNK_PAGES * CLUSTER_BLOCK_SIZE); accessor->write_chunk->ntuples = 0; accessor->write_pointer = &accessor->write_chunk->data[0]; accessor->write_end = (char *) - accessor->write_chunk + STS_CHUNK_PAGES * BLCKSZ; + accessor->write_chunk + STS_CHUNK_PAGES * CLUSTER_BLOCK_SIZE; } else { @@ -445,7 +445,7 @@ sts_read_tuple(SharedTuplestoreAccessor *accessor, void *meta_data) } remaining_size = size - sizeof(uint32); this_chunk_size = Min(remaining_size, - BLCKSZ * STS_CHUNK_PAGES - accessor->read_bytes); + CLUSTER_BLOCK_SIZE * STS_CHUNK_PAGES - accessor->read_bytes); destination = accessor->read_buffer + sizeof(uint32); BufFileReadExact(accessor->read_file, destination, this_chunk_size); accessor->read_bytes += this_chunk_size; @@ -468,7 +468,7 @@ sts_read_tuple(SharedTuplestoreAccessor *accessor, void *meta_data) errdetail_internal("Expected overflow chunk."))); accessor->read_next_page += STS_CHUNK_PAGES; this_chunk_size = Min(remaining_size, - BLCKSZ * STS_CHUNK_PAGES - + CLUSTER_BLOCK_SIZE * STS_CHUNK_PAGES - STS_CHUNK_HEADER_SIZE); BufFileReadExact(accessor->read_file, destination, this_chunk_size); accessor->read_bytes += this_chunk_size; diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index e5a4e5b371..09e762a015 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -179,8 +179,8 @@ typedef enum */ #define MINORDER 6 /* minimum merge order */ #define MAXORDER 500 /* maximum merge order */ -#define TAPE_BUFFER_OVERHEAD BLCKSZ -#define MERGE_BUFFER_SIZE (BLCKSZ * 32) +#define TAPE_BUFFER_OVERHEAD CLUSTER_BLOCK_SIZE +#define MERGE_BUFFER_SIZE (CLUSTER_BLOCK_SIZE * 32) /* @@ -1003,7 +1003,7 @@ tuplesort_updatemax(Tuplesortstate *state) if (state->tapeset) { isSpaceDisk = true; - spaceUsed = LogicalTapeSetBlocks(state->tapeset) * BLCKSZ; + spaceUsed = 
LogicalTapeSetBlocks(state->tapeset) * CLUSTER_BLOCK_SIZE; } else { diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index fc1fb363e7..2cabd41a03 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -69,6 +69,7 @@ #include "catalog/pg_class_d.h" /* pgrminclude ignore */ #include "catalog/pg_collation_d.h" #include "catalog/pg_database_d.h" /* pgrminclude ignore */ +#include "common/blocksize.h" #include "common/file_perm.h" #include "common/file_utils.h" #include "common/logging.h" @@ -165,6 +166,8 @@ static bool data_checksums = false; static char *xlog_dir = NULL; static char *str_wal_segment_size_mb = NULL; static int wal_segment_size_mb; +static char *str_block_size = NULL; +int block_size = DEFAULT_BLOCK_SIZE; /* internal vars */ @@ -1108,8 +1111,8 @@ test_config_settings(void) for (i = 0; i < bufslen; i++) { - /* Use same amount of memory, independent of BLCKSZ */ - test_buffs = (trial_bufs[i] * 8192) / BLCKSZ; + /* Use same amount of memory, independent of block_size */ + test_buffs = (trial_bufs[i] * 8192) / block_size; if (test_buffs <= ok_buffers) { test_buffs = ok_buffers; @@ -1121,10 +1124,10 @@ test_config_settings(void) } n_buffers = test_buffs; - if ((n_buffers * (BLCKSZ / 1024)) % 1024 == 0) - printf("%dMB\n", (n_buffers * (BLCKSZ / 1024)) / 1024); + if ((n_buffers * (block_size / 1024)) % 1024 == 0) + printf("%dMB\n", (n_buffers * (block_size / 1024)) / 1024); else - printf("%dkB\n", n_buffers * (BLCKSZ / 1024)); + printf("%dkB\n", n_buffers * (block_size / 1024)); printf(_("selecting default time zone ... 
")); fflush(stdout); @@ -1145,11 +1148,12 @@ test_specific_config_settings(int test_conns, int test_buffs) /* Set up the test postmaster invocation */ printfPQExpBuffer(cmd, - "\"%s\" --check %s %s " + "\"%s\" --check %s %s -b %d " "-c max_connections=%d " "-c shared_buffers=%d " "-c dynamic_shared_memory_type=%s", backend_exec, boot_options, extra_options, + block_size, test_conns, test_buffs, dynamic_shared_memory_type); @@ -1214,12 +1218,12 @@ setup_config(void) conflines = replace_guc_value(conflines, "max_connections", repltok, false); - if ((n_buffers * (BLCKSZ / 1024)) % 1024 == 0) + if ((n_buffers * (block_size / 1024)) % 1024 == 0) snprintf(repltok, sizeof(repltok), "%dMB", - (n_buffers * (BLCKSZ / 1024)) / 1024); + (n_buffers * (block_size / 1024)) / 1024); else snprintf(repltok, sizeof(repltok), "%dkB", - n_buffers * (BLCKSZ / 1024)); + n_buffers * (block_size / 1024)); conflines = replace_guc_value(conflines, "shared_buffers", repltok, false); @@ -1294,21 +1298,21 @@ setup_config(void) #if DEFAULT_BACKEND_FLUSH_AFTER > 0 snprintf(repltok, sizeof(repltok), "%dkB", - DEFAULT_BACKEND_FLUSH_AFTER * (BLCKSZ / 1024)); + DEFAULT_BACKEND_FLUSH_AFTER * (block_size / 1024)); conflines = replace_guc_value(conflines, "backend_flush_after", repltok, true); #endif #if DEFAULT_BGWRITER_FLUSH_AFTER > 0 snprintf(repltok, sizeof(repltok), "%dkB", - DEFAULT_BGWRITER_FLUSH_AFTER * (BLCKSZ / 1024)); + DEFAULT_BGWRITER_FLUSH_AFTER * (block_size / 1024)); conflines = replace_guc_value(conflines, "bgwriter_flush_after", repltok, true); #endif #if DEFAULT_CHECKPOINT_FLUSH_AFTER > 0 snprintf(repltok, sizeof(repltok), "%dkB", - DEFAULT_CHECKPOINT_FLUSH_AFTER * (BLCKSZ / 1024)); + DEFAULT_CHECKPOINT_FLUSH_AFTER * (block_size / 1024)); conflines = replace_guc_value(conflines, "checkpoint_flush_after", repltok, true); #endif @@ -1531,9 +1535,10 @@ bootstrap_template1(void) unsetenv("PGCLIENTENCODING"); snprintf(cmd, sizeof(cmd), - "\"%s\" --boot -X %d %s %s %s %s", + "\"%s\" --boot 
-X %d -b %d %s %s %s %s", backend_exec, wal_segment_size_mb * (1024 * 1024), + block_size, data_checksums ? "-k" : "", boot_options, extra_options, debug ? "-d 5" : ""); @@ -2431,6 +2436,7 @@ usage(const char *progname) printf(_(" -A, --auth=METHOD default authentication method for local connections\n")); printf(_(" --auth-host=METHOD default authentication method for local TCP/IP connections\n")); printf(_(" --auth-local=METHOD default authentication method for local-socket connections\n")); + printf(_(" -b, --block-size=SIZE size of database blocks, in kilobytes\n")); printf(_(" [-D, --pgdata=]DATADIR location for this database cluster\n")); printf(_(" -E, --encoding=ENCODING set default encoding for new databases\n")); printf(_(" -g, --allow-group-access allow group read/execute on data directory\n")); @@ -3094,6 +3100,7 @@ main(int argc, char *argv[]) {"sync-only", no_argument, NULL, 'S'}, {"waldir", required_argument, NULL, 'X'}, {"wal-segsize", required_argument, NULL, 12}, + {"block-size", required_argument, NULL, 'b'}, {"data-checksums", no_argument, NULL, 'k'}, {"allow-group-access", no_argument, NULL, 'g'}, {"discard-caches", no_argument, NULL, 14}, @@ -3141,7 +3148,7 @@ main(int argc, char *argv[]) /* process command-line options */ - while ((c = getopt_long(argc, argv, "A:c:dD:E:gkL:nNsST:U:WX:", + while ((c = getopt_long(argc, argv, "A:b:c:dD:E:gkL:nNsST:U:WX:", long_options, &option_index)) != -1) { switch (c) @@ -3165,6 +3172,9 @@ main(int argc, char *argv[]) case 11: authmethodhost = pg_strdup(optarg); break; + case 'b': + str_block_size = pg_strdup(optarg); + break; case 'c': { char *buf = pg_strdup(optarg); @@ -3359,6 +3369,26 @@ main(int argc, char *argv[]) pg_fatal("argument of --wal-segsize must be a power of 2 between 1 and 1024"); } + if (str_block_size == NULL) + block_size = DEFAULT_BLOCK_SIZE; + else + { + char *endptr; + + /* check that the argument is a number */ + block_size = strtol(str_block_size, &endptr, 10); + + /* verify that the 
block size is valid */ + if (endptr == str_block_size || *endptr != '\0') + pg_fatal("argument of --block-size must be a number"); + block_size *= 1024; + /* check for valid block_size; last is bitwise power of two check */ + if (!IsValidBlockSize(block_size)) + pg_fatal("argument of --block-size must be a power of 2 between 1 and 32"); + } + + BlockSizeInit(block_size); + get_restricted_token(); setup_pgdata(); @@ -3392,6 +3422,8 @@ main(int argc, char *argv[]) else printf(_("Data page checksums are disabled.\n")); + printf(_("Selected server block size: %d\n"), block_size); + if (pwprompt || pwfilename) get_su_pwd(); diff --git a/src/bin/initdb/meson.build b/src/bin/initdb/meson.build index 49743630aa..e2b8d88bff 100644 --- a/src/bin/initdb/meson.build +++ b/src/bin/initdb/meson.build @@ -31,6 +31,7 @@ tests += { 'env': {'with_icu': icu.found() ? 'yes' : 'no'}, 'tests': [ 't/001_initdb.pl', + 't/002_blocksize.pl', ], }, } diff --git a/src/bin/initdb/t/002_blocksize.pl b/src/bin/initdb/t/002_blocksize.pl new file mode 100644 index 0000000000..d8d4208fed --- /dev/null +++ b/src/bin/initdb/t/002_blocksize.pl @@ -0,0 +1,24 @@ + +# Copyright (c) 2023, PostgreSQL Global Development Group + + +use strict; +use warnings; +use Fcntl ':mode'; +use File::stat qw{lstat}; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +for my $blocksize (1,2,4,8,16,32) { + my $node1 = PostgreSQL::Test::Cluster->new('node' . $blocksize); + $node1->init(extra => ['--block-size='.($blocksize)] ); + $node1->start; + + is($node1->safe_psql('postgres',q{SELECT current_setting('block_size')}), + 1024 * $blocksize, "initdb with blocksize " . $blocksize . 
"k"); + + $node1->stop; +} + +done_testing(); diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 1dc8efe0cb..b983c2c354 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -28,6 +28,7 @@ #include "access/xlog_internal.h" #include "backup/basebackup.h" #include "bbstreamer.h" +#include "common/blocksize.h" #include "common/compression.h" #include "common/file_perm.h" #include "common/file_utils.h" @@ -2735,6 +2736,12 @@ main(int argc, char **argv) if (!RetrieveWalSegSize(conn)) exit(1); + /* detect and set remote server's block size */ + if (!RetrieveBlockSize(conn)) + exit(1); + + BlockSizeInit(BlockSize); + /* Create pg_wal symlink, if required */ if (xlog_dir) { diff --git a/src/bin/pg_basebackup/pg_receivewal.c b/src/bin/pg_basebackup/pg_receivewal.c index d0a4079d50..e5f5c804ee 100644 --- a/src/bin/pg_basebackup/pg_receivewal.c +++ b/src/bin/pg_basebackup/pg_receivewal.c @@ -897,6 +897,10 @@ main(int argc, char **argv) /* determine remote server's xlog segment size */ if (!RetrieveWalSegSize(conn)) exit(1); + if (!RetrieveBlockSize(conn)) + exit(1); + + BlockSizeInit(BlockSize); /* * Don't close the connection here so that subsequent StreamLog() can diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index 15514599c4..c8aea95c2f 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -32,6 +32,7 @@ #define ERRCODE_DUPLICATE_OBJECT "42710" int WalSegSz; +int BlockSize; static bool RetrieveDataDirCreatePerm(PGconn *conn); @@ -331,6 +332,64 @@ RetrieveWalSegSize(PGconn *conn) return true; } +/* + * Use SHOW block_size since ControlFile is not accessible here. 
+ */ +bool +RetrieveBlockSize(PGconn *conn) +{ + PGresult *res; + + /* check connection existence */ + Assert(conn != NULL); + + /* for previous versions set the default block size */ + if (PQserverVersion(conn) < MINIMUM_VERSION_FOR_SHOW_CMD) + { + BlockSize = DEFAULT_BLOCK_SIZE; + return true; + } + + res = PQexec(conn, "SHOW block_size"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + pg_log_error("could not send replication command \"%s\": %s", + "SHOW block_size", PQerrorMessage(conn)); + + PQclear(res); + return false; + } + if (PQntuples(res) != 1 || PQnfields(res) < 1) + { + pg_log_error("could not fetch block size: got %d rows and %d fields, expected %d rows and %d or more fields", + PQntuples(res), PQnfields(res), 1, 1); + + PQclear(res); + return false; + } + + /* fetch the block size value from the result */ + if (sscanf(PQgetvalue(res, 0, 0), "%d", &BlockSize) != 1) + { + pg_log_error("block size could not be parsed"); + PQclear(res); + return false; + } + + PQclear(res); + + if (!IsValidBlockSize(BlockSize)) + { + pg_log_error(ngettext("Block size must be a power of two between 1k and 32k, but the remote server reported a value of %d byte", + "Block size must be a power of two between 1k and 32k, but the remote server reported a value of %d bytes", + BlockSize), + BlockSize); + return false; + } + + return true; +} + /* * RetrieveDataDirCreatePerm * diff --git a/src/bin/pg_basebackup/streamutil.h b/src/bin/pg_basebackup/streamutil.h index 268c163213..c97f9fa969 100644 --- a/src/bin/pg_basebackup/streamutil.h +++ b/src/bin/pg_basebackup/streamutil.h @@ -13,6 +13,7 @@ #define STREAMUTIL_H #include "access/xlogdefs.h" +#include "common/blocksize.h" #include "datatype/timestamp.h" #include "libpq-fe.h" #include "pqexpbuffer.h" @@ -25,6 +26,7 @@ extern char *dbport; extern char *dbname; extern int dbgetpassword; extern int WalSegSz; +extern int BlockSize; /* Connection kept global so we can disconnect easily */ extern PGconn *conn; @@ -56,6 +58,7 @@ 
extern bool GetSlotInformation(PGconn *conn, const char *slot_name, XLogRecPtr *restart_lsn, TimeLineID *restart_tli); extern bool RetrieveWalSegSize(PGconn *conn); +extern bool RetrieveBlockSize(PGconn *conn); extern TimestampTz feGetCurrentTimestamp(void); extern void feTimestampDifference(TimestampTz start_time, TimestampTz stop_time, long *secs, int *microsecs); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 19eb67e485..0affb90e66 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -21,6 +21,7 @@ #include #include "access/xlog_internal.h" +#include "common/blocksize.h" #include "common/controldata_utils.h" #include "common/file_perm.h" #include "common/file_utils.h" @@ -204,18 +205,18 @@ scan_file(const char *fn, int segmentno) for (blockno = 0;; blockno++) { uint16 csum; - int r = read(f, buf.data, BLCKSZ); + int r = read(f, buf.data, CLUSTER_BLOCK_SIZE); if (r == 0) break; - if (r != BLCKSZ) + if (r != CLUSTER_BLOCK_SIZE) { if (r < 0) pg_fatal("could not read block %u in file \"%s\": %m", blockno, fn); else pg_fatal("could not read block %u in file \"%s\": read %d of %d", - blockno, fn, r, BLCKSZ); + blockno, fn, r, CLUSTER_BLOCK_SIZE); } blocks_scanned++; @@ -231,14 +232,14 @@ scan_file(const char *fn, int segmentno) if (PageIsNew(buf.data)) continue; - csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE); + csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE, CLUSTER_BLOCK_SIZE); if (mode == PG_MODE_CHECK) { if (csum != header->pd_checksum) { if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION) - pg_log_error("checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X", - fn, blockno, csum, header->pd_checksum); + pg_log_error("checksum verification failed in file \"%s\", block %u, block_size: %u: calculated checksum %X but block contains %X", + fn, blockno, CLUSTER_BLOCK_SIZE, csum, 
header->pd_checksum); badblocks++; } } @@ -259,19 +260,19 @@ scan_file(const char *fn, int segmentno) header->pd_checksum = csum; /* Seek back to beginning of block */ - if (lseek(f, -BLCKSZ, SEEK_CUR) < 0) + if (lseek(f, -(int)CLUSTER_BLOCK_SIZE, SEEK_CUR) < 0) pg_fatal("seek failed for block %u in file \"%s\": %m", blockno, fn); /* Write block with checksum */ - w = write(f, buf.data, BLCKSZ); - if (w != BLCKSZ) + w = write(f, buf.data, CLUSTER_BLOCK_SIZE); + if (w != CLUSTER_BLOCK_SIZE) { if (w < 0) pg_fatal("could not write block %u in file \"%s\": %m", blockno, fn); else pg_fatal("could not write block %u in file \"%s\": wrote %d of %d", - blockno, fn, w, BLCKSZ); + blockno, fn, w, CLUSTER_BLOCK_SIZE); } } @@ -551,14 +552,6 @@ main(int argc, char *argv[]) if (ControlFile->pg_control_version != PG_CONTROL_VERSION) pg_fatal("cluster is not compatible with this version of pg_checksums"); - if (ControlFile->blcksz != BLCKSZ) - { - pg_log_error("database cluster is not compatible"); - pg_log_error_detail("The database cluster was initialized with block size %u, but pg_checksums was compiled with block size %u.", - ControlFile->blcksz, BLCKSZ); - exit(1); - } - /* * Check if cluster is running. A clean shutdown is required to avoid * random checksum failures caused by torn pages. 
Note that this doesn't @@ -580,6 +573,11 @@ main(int argc, char *argv[]) mode == PG_MODE_ENABLE) pg_fatal("data checksums are already enabled in cluster"); + if (IsValidBlockSize(ControlFile->blcksz)) + BlockSizeInit(ControlFile->blcksz); + else + pg_fatal("invalid cluster block size in control file"); + /* Operate on all files if checking or enabling checksums */ if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE) { diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index e7ef2b8bd0..0c2be791df 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -49,6 +49,7 @@ #include "access/transam.h" #include "access/xlog.h" #include "access/xlog_internal.h" +#include "common/blocksize.h" #include "common/controldata_utils.h" #include "common/fe_memutils.h" #include "common/file_perm.h" @@ -75,6 +76,7 @@ static TimeLineID minXlogTli = 0; static XLogSegNo minXlogSegNo = 0; static int WalSegSz; static int set_wal_segsize; +static int set_blocksize = 0; static void CheckDataVersion(void); static bool read_controlfile(void); @@ -93,6 +95,7 @@ int main(int argc, char *argv[]) { static struct option long_options[] = { + {"block-size", required_argument, NULL, 'b'}, {"commit-timestamp-ids", required_argument, NULL, 'c'}, {"pgdata", required_argument, NULL, 'D'}, {"epoch", required_argument, NULL, 'e'}, @@ -137,10 +140,19 @@ main(int argc, char *argv[]) } - while ((c = getopt_long(argc, argv, "c:D:e:fl:m:no:O:u:x:", long_options, NULL)) != -1) + while ((c = getopt_long(argc, argv, "b:c:D:e:fl:m:no:O:u:x:", long_options, NULL)) != -1) { switch (c) { + case 'b': + errno = 0; + set_blocksize = strtol(optarg, &endptr, 10); + if (endptr == optarg || *endptr != '\0' || errno != 0) + pg_fatal("argument of --block-size must be a number"); + set_blocksize *= 1024; + if (!IsValidBlockSize(set_blocksize)) + pg_fatal("argument of --block-size must be a power of 2 between 1 and 32"); + break; case 'D': DataDir = optarg; break; 
@@ -387,6 +399,33 @@ main(int argc, char *argv[]) else WalSegSz = ControlFile.xlog_seg_size; + /* + * If a blocksize was specified, compare to existing ControlFile; if we + * are wrong, we won't be able to read the data. We will only want to set + * it if we guessed. + */ + if (set_blocksize == 0) + { + if (guessed) + pg_fatal("cannot determine cluster block size; provide explicitly via --block-size"); + } + else + { + if (!guessed && set_blocksize != ControlFile.blcksz) + pg_fatal("cannot change block size in cluster"); + + /* hope this is right, but by default we don't know; likely this is + * DEFAULT_BLOCK_SIZE */ + ControlFile.blcksz = set_blocksize; + } + + /* + * Set some dependent calculated fields stored in pg_control + */ + BlockSizeInit(ControlFile.blcksz); + ControlFile.toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE; + ControlFile.loblksize = LOBLKSIZE; + if (log_fname != NULL) XLogFromFileName(log_fname, &minXlogTli, &minXlogSegNo, WalSegSz); @@ -610,6 +649,16 @@ read_controlfile(void) return false; } + /* return false if block size is not valid */ + if (!IsValidBlockSize(ControlFile.blcksz)) + { + pg_log_warning(ngettext("pg_control specifies invalid block size (%d byte); proceed with caution", + "pg_control specifies invalid block size (%d bytes); proceed with caution", + ControlFile.blcksz), + ControlFile.blcksz); + return false; + } + return true; } @@ -682,7 +731,7 @@ GuessControlValues(void) ControlFile.maxAlign = MAXIMUM_ALIGNOF; ControlFile.floatFormat = FLOATFORMAT_VALUE; - ControlFile.blcksz = BLCKSZ; + ControlFile.blcksz = DEFAULT_BLOCK_SIZE; ControlFile.relseg_size = RELSEG_SIZE; ControlFile.xlog_blcksz = XLOG_BLCKSZ; ControlFile.xlog_seg_size = DEFAULT_XLOG_SEG_SIZE; @@ -1127,6 +1176,7 @@ usage(void) printf(_("%s resets the PostgreSQL write-ahead log.\n\n"), progname); printf(_("Usage:\n %s [OPTION]...
DATADIR\n\n"), progname); printf(_("Options:\n")); + printf(_(" -b, --block-size=SIZE cluster block size, in kilobytes\n")); printf(_(" -c, --commit-timestamp-ids=XID,XID\n" " set oldest and newest transactions bearing\n" " commit timestamp (zero means no change)\n")); diff --git a/src/bin/pg_resetwal/t/002_corrupted.pl b/src/bin/pg_resetwal/t/002_corrupted.pl index 6d19a1efd5..84ed4e92b0 100644 --- a/src/bin/pg_resetwal/t/002_corrupted.pl +++ b/src/bin/pg_resetwal/t/002_corrupted.pl @@ -31,7 +31,7 @@ print $fh pack("x[$size]"); close $fh; command_checks_all( - [ 'pg_resetwal', '-n', $node->data_dir ], + [ 'pg_resetwal', '-b', '8', '-n', $node->data_dir ], 0, [qr/pg_control version number/], [ @@ -47,7 +47,7 @@ print $fh $data, pack("x[" . ($size - 16) . "]"); close $fh; command_checks_all( - [ 'pg_resetwal', '-n', $node->data_dir ], + [ 'pg_resetwal', '-b', '8', '-n', $node->data_dir ], 0, [qr/pg_control version number/], [ diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c index bd5c598e20..e354432aa5 100644 --- a/src/bin/pg_rewind/filemap.c +++ b/src/bin/pg_rewind/filemap.c @@ -27,6 +27,7 @@ #include #include "catalog/pg_tablespace_d.h" +#include "common/blocksize.h" #include "common/hashfn.h" #include "common/string.h" #include "datapagemap.h" @@ -333,7 +334,7 @@ process_target_wal_block_change(ForkNumber forknum, RelFileLocator rlocator, { off_t end_offset; - end_offset = (blkno_inseg + 1) * BLCKSZ; + end_offset = (blkno_inseg + 1) * CLUSTER_BLOCK_SIZE; if (end_offset <= entry->source_size && end_offset <= entry->target_size) datapagemap_add(&entry->target_pages_to_overwrite, blkno_inseg); } @@ -468,7 +469,7 @@ calculate_totals(filemap_t *filemap) iter = datapagemap_iterate(&entry->target_pages_to_overwrite); while (datapagemap_next(iter, &blk)) - filemap->fetch_size += BLCKSZ; + filemap->fetch_size += CLUSTER_BLOCK_SIZE; pg_free(iter); } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index f7f3b8227f..dda31f6768 100644
--- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -18,6 +18,7 @@ #include "access/xlog_internal.h" #include "catalog/catversion.h" #include "catalog/pg_control.h" +#include "common/blocksize.h" #include "common/controldata_utils.h" #include "common/file_perm.h" #include "common/restricted_token.h" @@ -340,6 +341,13 @@ main(int argc, char **argv) sanityChecks(); + if (IsValidBlockSize(ControlFile_source.blcksz) && + ControlFile_source.blcksz == ControlFile_target.blcksz) + BlockSizeInit(ControlFile_source.blcksz); + else + pg_fatal("cluster block sizes do not match or are invalid: %d", + ControlFile_source.blcksz); + /* * Usually, the TLI can be found in the latest checkpoint record. But if * the source server is just being promoted (or it's a standby that's @@ -571,8 +579,8 @@ perform_rewind(filemap_t *filemap, rewind_source *source, iter = datapagemap_iterate(&entry->target_pages_to_overwrite); while (datapagemap_next(iter, &blkno)) { - offset = blkno * BLCKSZ; - source->queue_fetch_range(source, entry->path, offset, BLCKSZ); + offset = blkno * CLUSTER_BLOCK_SIZE; + source->queue_fetch_range(source, entry->path, offset, CLUSTER_BLOCK_SIZE); } pg_free(iter); } diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index d173602882..57c11a43ab 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -19,14 +19,17 @@ #include #endif + #include "access/visibilitymapdefs.h" +#include "common/blocksize.h" #include "common/file_perm.h" #include "pg_upgrade.h" +#undef CLUSTER_BLOCK_SIZE +#define CLUSTER_BLOCK_SIZE BlockSize #include "storage/bufpage.h" #include "storage/checksum.h" #include "storage/checksum_impl.h" - /* * cloneFile() * @@ -96,7 +99,7 @@ copyFile(const char *src, const char *dst, schemaName, relName, dst, strerror(errno)); /* copy in fairly large chunks for best efficiency */ -#define COPY_BUF_SIZE (50 * BLCKSZ) +#define COPY_BUF_SIZE (50 * BlockSize) buffer = (char *) pg_malloc(COPY_BUF_SIZE); @@ 
-187,7 +190,7 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, struct stat statbuf; /* Compute number of old-format bytes per new page */ - rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; + rewriteVmBytesPerPage = (BlockSize - SizeOfPageHeaderData) / 2; if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", @@ -220,7 +223,7 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, PageHeaderData pageheader; bool old_lastblk; - if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ) + if ((bytesRead = read(src_fd, buffer.data, BlockSize)) != BlockSize) { if (bytesRead < 0) pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", @@ -230,7 +233,7 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, schemaName, relName, fromfile); } - totalBytesRead += BLCKSZ; + totalBytesRead += BlockSize; old_lastblk = (totalBytesRead == src_filesize); /* Save the page header data */ @@ -293,10 +296,10 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, /* Set new checksum for visibility map page, if enabled */ if (new_cluster.controldata.data_checksum_version != 0) ((PageHeader) new_vmbuf.data)->pd_checksum = - pg_checksum_page(new_vmbuf.data, new_blkno); + pg_checksum_page(new_vmbuf.data, new_blkno, BlockSize); errno = 0; - if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ) + if (write(dst_fd, new_vmbuf.data, BlockSize) != BlockSize) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build index 12a97f84e2..06b26e54a5 100644 --- a/src/bin/pg_upgrade/meson.build +++ b/src/bin/pg_upgrade/meson.build @@ -1,5 +1,7 @@ # Copyright (c) 2022-2023, PostgreSQL Global Development Group +fs = import('fs') + pg_upgrade_sources = files( 'check.c', 'controldata.c', diff --git a/src/bin/pg_upgrade/pg_upgrade.c 
b/src/bin/pg_upgrade/pg_upgrade.c index 4562dafcff..6a64d8689a 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -63,6 +63,7 @@ static void setup(char *argv0, bool *live_check); ClusterInfo old_cluster, new_cluster; OSInfo os_info; +uint32 BlockSize; char *output_files[] = { SERVER_LOG_FILE, @@ -126,6 +127,8 @@ main(int argc, char **argv) check_cluster_compatibility(live_check); + BlockSize = old_cluster.controldata.blocksz; + check_and_dump_old_cluster(live_check); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 3eea0139c7..1c98fd51cc 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -336,7 +336,7 @@ extern UserOpts user_opts; extern ClusterInfo old_cluster, new_cluster; extern OSInfo os_info; - +extern uint32 BlockSize; /* check.c */ diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 96845e1a1a..feb7f4af45 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -23,6 +23,8 @@ #include "access/xlogreader.h" #include "access/xlogrecord.h" #include "access/xlogstats.h" +#include "common/blocksize.h" +#include "common/controldata_utils.h" #include "common/fe_memutils.h" #include "common/file_perm.h" #include "common/file_utils.h" @@ -38,8 +40,10 @@ */ static const char *progname; +static char *pgdata; static int WalSegSz; +static int BlockSize; static volatile sig_atomic_t time_to_stop = false; static const RelFileLocator emptyRelFileLocator = {0, 0, 0}; @@ -527,7 +531,7 @@ XLogRecordSaveFPWs(XLogReaderState *record, const char *savepath) if (!file) pg_fatal("could not open file \"%s\": %m", filename); - if (fwrite(page, BLCKSZ, 1, file) != 1) + if (fwrite(page, BlockSize, 1, file) != 1) pg_fatal("could not write file \"%s\": %m", filename); if (fclose(file) != 0) @@ -808,6 +812,7 @@ main(int argc, char **argv) {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, {"path", 
required_argument, NULL, 'p'}, + {"pgdata", required_argument, NULL, 'D'}, {"quiet", no_argument, NULL, 'q'}, {"relation", required_argument, NULL, 'R'}, {"rmgr", required_argument, NULL, 'r'}, @@ -881,7 +886,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:D:e:fF:n:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -899,6 +904,9 @@ main(int argc, char **argv) config.filter_by_relation_block_enabled = true; config.filter_by_extended = true; break; + case 'D': + pgdata = pg_strdup(optarg); + break; case 'e': if (sscanf(optarg, "%X/%X", &xlogid, &xrecoff) != 2) { @@ -1103,7 +1111,31 @@ main(int argc, char **argv) } if (config.save_fullpage_path != NULL) + { + ControlFileData *ControlFile; + bool crc_ok_p; + + /* parse the control file if we're saving the fullpage images */ + + if (!pgdata) + pgdata = getenv("PGDATA"); + if (!pgdata || strlen(pgdata) == 0) + pg_fatal("Must provide data directory via -D or PGDATA envvar"); + + ControlFile = get_controlfile(pgdata, &crc_ok_p); + + if (!ControlFile) + pg_fatal("Could not locate control file"); + + BlockSize = ControlFile->blcksz; + + if (!IsValidBlockSize(BlockSize)) + pg_fatal("read invalid block size from control file"); + + BlockSizeInit(BlockSize); + create_fullpage_directory(config.save_fullpage_path); + } /* parse files as start/end boundaries, extract path if not specified */ if (optind < argc) diff --git a/src/bin/pg_waldump/t/002_save_fullpage.pl b/src/bin/pg_waldump/t/002_save_fullpage.pl index f0725805f2..5cd8cffd5c 100644 --- a/src/bin/pg_waldump/t/002_save_fullpage.pl +++ b/src/bin/pg_waldump/t/002_save_fullpage.pl @@ -73,6 +73,7 @@ $node->command_ok( [ 'pg_waldump', '--quiet', '--save-fullpage', "$tmp_folder/raw", + '-D', $node->data_dir, '--relation', $relation, $walfile ], diff --git a/src/common/Makefile b/src/common/Makefile index 113029bf7b..da96f2dc8d 100644 
--- a/src/common/Makefile +++ b/src/common/Makefile @@ -48,6 +48,7 @@ LIBS += $(PTHREAD_LIBS) OBJS_COMMON = \ archive.o \ base64.o \ + blocksize.o \ checksum_helper.o \ compression.o \ config_info.o \ diff --git a/src/common/blocksize.c b/src/common/blocksize.c new file mode 100644 index 0000000000..ef2b3ed34c --- /dev/null +++ b/src/common/blocksize.c @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * blocksize.c + * This file contains methods to calculate various size constants for variable-sized blocks. + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/blocksize.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "access/heaptoast.h" +#include "access/htup_details.h" +#include "access/itup.h" +#include "access/nbtree_int.h" +#include "common/blocksize.h" +#ifndef FRONTEND +#include "storage/freespace.h" +#endif + +PGDLLIMPORT uint32 calculated_block_sizes[BS_NUM_SIZES]; + +/* + * This routine will calculate and cache the necessary constants. This should + * be called once very early in the process (as soon as the native block + * size is known, so after reading ControlFile).
+ */ + +void +BlockSizeInit(Size rawblocksize) +{ + Assert(IsValidBlockSize(rawblocksize)); + + calculated_block_sizes[BS_BLOCK_SIZE] = rawblocksize; + Assert(rawblocksize <= MAX_BLOCK_SIZE); + calculated_block_sizes[BS_MAX_HEAP_TUPLES] = CalcMaxHeapTupleSize(rawblocksize); + Assert(calculated_block_sizes[BS_MAX_HEAP_TUPLES] <= MaxHeapTupleSizeLimit); + calculated_block_sizes[BS_MAX_HEAP_TUPLES_PER_PAGE] = CalcMaxHeapTuplesPerPage(rawblocksize); + Assert(calculated_block_sizes[BS_MAX_HEAP_TUPLES_PER_PAGE] <= MaxHeapTuplesPerPageLimit); + calculated_block_sizes[BS_MAX_INDEX_TUPLES_PER_PAGE] = CalcMaxIndexTuplesPerPage(rawblocksize); + Assert(calculated_block_sizes[BS_MAX_INDEX_TUPLES_PER_PAGE] <= MaxIndexTuplesPerPageLimit); + calculated_block_sizes[BS_MAX_TIDS_PER_BTREE_PAGE] = CalcMaxTIDsPerBTreePage(rawblocksize); + Assert(calculated_block_sizes[BS_MAX_TIDS_PER_BTREE_PAGE] <= MaxTIDsPerBTreePageLimit); + calculated_block_sizes[BS_TOAST_MAX_CHUNK_SIZE] = CalcToastMaxChunkSize(rawblocksize); + Assert(calculated_block_sizes[BS_TOAST_MAX_CHUNK_SIZE] <= TOAST_MAX_CHUNK_SIZE_LIMIT); + + #ifndef FRONTEND + /* also setup the FreeSpaceMap internal sizing */ + FreeSpaceMapInit(); + #endif +} diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 74833c4acb..d218510b23 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -540,7 +540,7 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset) { - static const PGIOAlignedBlock zbuffer = {{0}}; /* worth MAX_BLOCK_SIZE */ + static const PGIOAlignedBlock zbuffer = {{0}}; /* worth MAX_BLOCK_SIZE */ void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; struct iovec iov[PG_IOV_MAX]; size_t remaining_size = size; @@ -558,10 +558,10 @@ pg_pwrite_zeros(int fd, size_t size, off_t offset) iov[iovcnt].iov_base = zerobuf_addr; - if (remaining_size < BLCKSZ) + if (remaining_size < sizeof(zbuffer.data))
this_iov_size = remaining_size; else - this_iov_size = BLCKSZ; + this_iov_size = sizeof(zbuffer.data); iov[iovcnt].iov_len = this_iov_size; remaining_size -= this_iov_size; diff --git a/src/common/meson.build b/src/common/meson.build index 9efc80ac02..140c864c6c 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -3,6 +3,7 @@ common_sources = files( 'archive.c', 'base64.c', + 'blocksize.c', 'checksum_helper.c', 'compression.c', 'controldata_utils.c', diff --git a/src/include/access/brin_page.h b/src/include/access/brin_page.h index 3670ca6010..070bb261e9 100644 --- a/src/include/access/brin_page.h +++ b/src/include/access/brin_page.h @@ -86,7 +86,7 @@ typedef struct RevmapContents } RevmapContents; #define REVMAP_CONTENT_SIZE \ - (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ + (CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData) - \ offsetof(RevmapContents, rm_tids) - \ MAXALIGN(sizeof(BrinSpecialSpace))) /* max num of items in the array */ diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index c59790ec5a..13e5b12cd7 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -248,7 +248,7 @@ typedef signed char GinNullCategory; */ #define GinMaxItemSize \ Min(INDEX_SIZE_MASK, \ - MAXALIGN_DOWN(((BLCKSZ - \ + MAXALIGN_DOWN(((CLUSTER_BLOCK_SIZE - \ MAXALIGN(SizeOfPageHeaderData + 3 * sizeof(ItemIdData)) - \ MAXALIGN(sizeof(GinPageOpaqueData))) / 3))) @@ -318,7 +318,7 @@ typedef signed char GinNullCategory; GinPageGetOpaque(page)->maxoff * sizeof(PostingItem)) #define GinDataPageMaxDataSize \ - (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + (CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData) \ - MAXALIGN(sizeof(ItemPointerData)) \ - MAXALIGN(sizeof(GinPageOpaqueData))) @@ -326,7 +326,7 @@ typedef signed char GinNullCategory; * List pages */ #define GinListPageSize \ - ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + ( CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - 
MAXALIGN(sizeof(GinPageOpaqueData)) ) /* * A compressed posting list. diff --git a/src/include/access/gist.h b/src/include/access/gist.h index 0235716c06..2272ac09e7 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -96,7 +96,7 @@ typedef GISTPageOpaqueData *GISTPageOpaque; * key size using opclass parameters. */ #define GISTMaxIndexTupleSize \ - MAXALIGN_DOWN((BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)) / \ + MAXALIGN_DOWN((CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)) / \ 4 - sizeof(ItemIdData)) #define GISTMaxIndexKeySize \ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 3edc740a3f..43d724d160 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -54,7 +54,7 @@ typedef struct /* Returns free space in node buffer page */ #define PAGE_FREE_SPACE(nbp) (nbp->freespace) /* Checks if node buffer page is empty */ -#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET) +#define PAGE_IS_EMPTY(nbp) (nbp->freespace == CLUSTER_BLOCK_SIZE - BUFFER_PAGE_DATA_OFFSET) /* Checks if node buffers page don't contain sufficient space for index tuple */ #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \ MAXALIGN(IndexTupleSize(itup))) @@ -171,7 +171,7 @@ typedef struct GISTScanOpaqueData GistNSN curPageLSN; /* pos in the WAL stream when page was read */ /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; + GISTSearchHeapItem pageData[MAX_BLOCK_SIZE / sizeof(IndexTupleData)]; OffsetNumber nPageData; /* number of valid items in array */ OffsetNumber curPageData; /* next item to return */ MemoryContext pageDataCxt; /* context holding the fetched tuples, for @@ -474,7 +474,7 @@ extern void gistadjustmembers(Oid opfamilyoid, /* gistutil.c */ #define GiSTPageSize \ - ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) 
) + ( CLUSTER_BLOCK_SIZE - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) #define GIST_MIN_FILLFACTOR 10 #define GIST_DEFAULT_FILLFACTOR 90 diff --git a/src/include/access/hash.h b/src/include/access/hash.h index 9e035270a1..8eab28c582 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -124,7 +124,7 @@ typedef struct HashScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - HashScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */ + HashScanPosItem items[MaxIndexTuplesPerPageLimit]; /* MUST BE LAST */ } HashScanPosData; #define HashScanPosIsPinned(scanpos) \ @@ -224,10 +224,10 @@ typedef HashScanOpaqueData *HashScanOpaque; * needing to fit into the metapage. (With 8K block size, 1024 bitmaps * limit us to 256 GB of overflow space...). For smaller block size we * can not use 1024 bitmaps as it will lead to the meta page data crossing - * the block size boundary. So we use BLCKSZ to determine the maximum number + * the block size boundary. So we use CLUSTER_BLOCK_SIZE to determine the maximum number * of bitmaps. 
*/ -#define HASH_MAX_BITMAPS Min(BLCKSZ / 8, 1024) +#define HASH_MAX_BITMAPS Min(DEFAULT_BLOCK_SIZE / 8, 1024) #define HASH_SPLITPOINT_PHASE_BITS 2 #define HASH_SPLITPOINT_PHASES_PER_GRP (1 << HASH_SPLITPOINT_PHASE_BITS) @@ -279,7 +279,7 @@ typedef struct HashOptions ((HashOptions *) (relation)->rd_options)->fillfactor : \ HASH_DEFAULT_FILLFACTOR) #define HashGetTargetPageUsage(relation) \ - (BLCKSZ * HashGetFillFactor(relation) / 100) + (CLUSTER_BLOCK_SIZE * HashGetFillFactor(relation) / 100) /* * Maximum size of a hash index item (it's okay to have only one per page) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index faf5026519..644c03145b 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -75,7 +75,7 @@ typedef struct HeapScanDescData /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ - OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]; /* their offsets */ + OffsetNumber rs_vistuples[MaxHeapTuplesPerPageLimit]; /* their offsets */ } HeapScanDescData; typedef struct HeapScanDescData *HeapScanDesc; diff --git a/src/include/access/heaptoast.h b/src/include/access/heaptoast.h index 5c0a796f66..f7c87d11c3 100644 --- a/src/include/access/heaptoast.h +++ b/src/include/access/heaptoast.h @@ -20,10 +20,7 @@ /* * Find the maximum size of a tuple if there are to be N tuples per page. */ -#define MaximumBytesPerTuple(tuplesPerPage) \ - MAXALIGN_DOWN((BLCKSZ - \ - MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ - / (tuplesPerPage)) +#define MaximumBytesPerTuple(tuplesPerPage) CalcMaximumBytesPerTuple(CLUSTER_BLOCK_SIZE,tuplesPerPage) /* * These symbols control toaster activation. 
If a tuple is larger than @@ -45,7 +42,7 @@ */ #define TOAST_TUPLES_PER_PAGE 4 -#define TOAST_TUPLE_THRESHOLD MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE) +#define TOAST_TUPLE_THRESHOLD CalcMaximumBytesPerTuple(CLUSTER_BLOCK_SIZE,TOAST_TUPLES_PER_PAGE) #define TOAST_TUPLE_TARGET TOAST_TUPLE_THRESHOLD @@ -58,7 +55,7 @@ */ #define TOAST_TUPLES_PER_PAGE_MAIN 1 -#define TOAST_TUPLE_TARGET_MAIN MaximumBytesPerTuple(TOAST_TUPLES_PER_PAGE_MAIN) +#define TOAST_TUPLE_TARGET_MAIN CalcMaximumBytesPerTuple(CLUSTER_BLOCK_SIZE,TOAST_TUPLES_PER_PAGE_MAIN) /* * If an index value is larger than TOAST_INDEX_TARGET, we will try to @@ -81,12 +78,9 @@ #define EXTERN_TUPLE_MAX_SIZE MaximumBytesPerTuple(EXTERN_TUPLES_PER_PAGE) -#define TOAST_MAX_CHUNK_SIZE \ - (EXTERN_TUPLE_MAX_SIZE - \ - MAXALIGN(SizeofHeapTupleHeader) - \ - sizeof(Oid) - \ - sizeof(int32) - \ - VARHDRSZ) +#define TOAST_MAX_CHUNK_SIZE_LIMIT CalcToastMaxChunkSize(MAX_BLOCK_SIZE) +#define TOAST_MAX_CHUNK_SIZE GetBlockSize(BS_TOAST_MAX_CHUNK_SIZE) + /* ---------- * heap_toast_insert_or_update - diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index e01f4f35c8..a76c9a658a 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -18,6 +18,7 @@ #include "access/transam.h" #include "access/tupdesc.h" #include "access/tupmacs.h" +#include "common/blocksize.h" #include "storage/bufpage.h" #include "varatt.h" @@ -427,8 +428,12 @@ do { \ (tup)->t_choice.t_heap.t_field3.t_xvac = (xid); \ } while (0) -StaticAssertDecl(MaxOffsetNumber < SpecTokenOffsetNumber, - "invalid speculative token constant"); +/* + * MaxOffsetNumber is no longer a compile-time constant, so check the + * worst case at the maximum supported block size instead. + */ +StaticAssertDecl((MAX_BLOCK_SIZE / sizeof(ItemIdData)) < SpecTokenOffsetNumber, + "invalid speculative token constant"); #define HeapTupleHeaderIsSpeculative(tup) \ ( \ @@ -551,7 +552,7 @@ StaticAssertDecl(MaxOffsetNumber < SpecTokenOffsetNumber, /* * MaxHeapTupleSize is the maximum allowed size of a heap tuple, including - * header and MAXALIGN alignment padding.
Basically it's BLCKSZ minus the + * header and MAXALIGN alignment padding. Basically it's CLUSTER_BLOCK_SIZE minus the * other stuff that has to be on a disk page. Since heap pages use no * "special space", there's no deduction for that. * @@ -560,7 +561,8 @@ StaticAssertDecl(MaxOffsetNumber < SpecTokenOffsetNumber, * ItemIds and tuples have different alignment requirements, don't assume that * you can, say, fit 2 tuples of size MaxHeapTupleSize/2 on the same page. */ -#define MaxHeapTupleSize (BLCKSZ - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) +#define MaxHeapTupleSize GetBlockSize(BS_MAX_HEAP_TUPLES) +#define MaxHeapTupleSizeLimit CalcMaxHeapTupleSize(MAX_BLOCK_SIZE) #define MinHeapTupleSize MAXALIGN(SizeofHeapTupleHeader) /* @@ -574,9 +576,9 @@ StaticAssertDecl(MaxOffsetNumber < SpecTokenOffsetNumber, * pointers to this anyway, to avoid excessive line-pointer bloat and not * require increases in the size of work arrays. */ -#define MaxHeapTuplesPerPage \ - ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ - (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)))) + +#define MaxHeapTuplesPerPage GetBlockSize(BS_MAX_HEAP_TUPLES_PER_PAGE) +#define MaxHeapTuplesPerPageLimit CalcMaxHeapTuplesPerPage(MAX_BLOCK_SIZE) /* * MaxAttrSize is a somewhat arbitrary upper limit on the declared size of diff --git a/src/include/access/itup.h b/src/include/access/itup.h index 2e2b8c7a47..2d6de24e67 100644 --- a/src/include/access/itup.h +++ b/src/include/access/itup.h @@ -163,8 +163,7 @@ index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull) * estimated here, seemingly allowing one more tuple than estimated here. * But such a page always has at least MAXALIGN special space, so we're safe. 
*/ -#define MaxIndexTuplesPerPage \ - ((int) ((BLCKSZ - SizeOfPageHeaderData) / \ - (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) +#define MaxIndexTuplesPerPage GetBlockSize(BS_MAX_INDEX_TUPLES_PER_PAGE) +#define MaxIndexTuplesPerPageLimit CalcMaxIndexTuplesPerPage(MAX_BLOCK_SIZE) #endif /* ITUP_H */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 8891fa7973..bed8595537 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -15,6 +15,7 @@ #define NBTREE_H #include "access/amapi.h" +#include "access/nbtree_int.h" #include "access/itup.h" #include "access/sdir.h" #include "access/tableam.h" @@ -25,208 +26,6 @@ #include "storage/bufmgr.h" #include "storage/shm_toc.h" -/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ -typedef uint16 BTCycleId; - -/* - * BTPageOpaqueData -- At the end of every page, we store a pointer - * to both siblings in the tree. This is used to do forward/backward - * index scans. The next-page link is also critical for recovery when - * a search has navigated to the wrong page due to concurrent page splits - * or deletions; see src/backend/access/nbtree/README for more info. - * - * In addition, we store the page's btree level (counting upwards from - * zero at a leaf page) as well as some flag bits indicating the page type - * and status. If the page is deleted, a BTDeletedPageData struct is stored - * in the page's tuple area, while a standard BTPageOpaqueData struct is - * stored in the page special area. - * - * We also store a "vacuum cycle ID". When a page is split while VACUUM is - * processing the index, a nonzero value associated with the VACUUM run is - * stored into both halves of the split page. (If VACUUM is not running, - * both pages receive zero cycleids.) 
This allows VACUUM to detect whether - * a page was split since it started, with a small probability of false match - * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs - * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left - * (original) page, and set in the right page, but only if the next page - * to its right has a different cycleid. - * - * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested - * instead. - * - * NOTE: the btpo_level field used to be a union type in order to allow - * deleted pages to store a 32-bit safexid in the same field. We now store - * 64-bit/full safexid values using BTDeletedPageData instead. - */ - -typedef struct BTPageOpaqueData -{ - BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ - BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - uint32 btpo_level; /* tree level --- zero for leaf pages */ - uint16 btpo_flags; /* flag bits, see below */ - BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ -} BTPageOpaqueData; - -typedef BTPageOpaqueData *BTPageOpaque; - -#define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page)) - -/* Bits defined in btpo_flags */ -#define BTP_LEAF (1 << 0) /* leaf page, i.e. not internal page */ -#define BTP_ROOT (1 << 1) /* root page (has no parent) */ -#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ -#define BTP_META (1 << 3) /* meta-page */ -#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ -#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ -#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ -#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ -#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ - -/* - * The max allowed value of a cycle ID is a bit less than 64K. 
This is - * for convenience of pg_filedump and similar utilities: we want to use - * the last 2 bytes of special space as an index type indicator, and - * restricting cycle ID lets btree use that space for vacuum cycle IDs - * while still allowing index type to be identified. - */ -#define MAX_BT_CYCLE_ID 0xFF7F - - -/* - * The Meta page is always the first page in the btree index. - * Its primary purpose is to point to the location of the btree root page. - * We also point to the "fast" root, which is the current effective root; - * see README for discussion. - */ - -typedef struct BTMetaPageData -{ - uint32 btm_magic; /* should contain BTREE_MAGIC */ - uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ - BlockNumber btm_root; /* current root location */ - uint32 btm_level; /* tree level of the root page */ - BlockNumber btm_fastroot; /* current "fast" root location */ - uint32 btm_fastlevel; /* tree level of the "fast" root page */ - /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - - /* number of deleted, non-recyclable pages during last cleanup */ - uint32 btm_last_cleanup_num_delpages; - /* number of heap tuples during last cleanup (deprecated) */ - float8 btm_last_cleanup_num_heap_tuples; - - bool btm_allequalimage; /* are all columns "equalimage"? */ -} BTMetaPageData; - -#define BTPageGetMeta(p) \ - ((BTMetaPageData *) PageGetContents(p)) - -/* - * The current Btree version is 4. That's what you'll get when you create - * a new index. - * - * Btree version 3 was used in PostgreSQL v11. It is mostly the same as - * version 4, but heap TIDs were not part of the keyspace. Index tuples - * with duplicate keys could be stored in any order. We continue to - * support reading and writing Btree versions 2 and 3, so that they don't - * need to be immediately re-indexed at pg_upgrade. In order to get the - * new heapkeyspace semantics, however, a REINDEX is needed. 
- * - * Deduplication is safe to use when the btm_allequalimage field is set to - * true. It's safe to read the btm_allequalimage field on version 3, but - * only version 4 indexes make use of deduplication. Even version 4 - * indexes created on PostgreSQL v12 will need a REINDEX to make use of - * deduplication, though, since there is no other way to set - * btm_allequalimage to true (pg_upgrade hasn't been taught to set the - * metapage field). - * - * Btree version 2 is mostly the same as version 3. There are two new - * fields in the metapage that were introduced in version 3. A version 2 - * metapage will be automatically upgraded to version 3 on the first - * insert to it. INCLUDE indexes cannot use version 2. - */ -#define BTREE_METAPAGE 0 /* first page is meta */ -#define BTREE_MAGIC 0x053162 /* magic number in metapage */ -#define BTREE_VERSION 4 /* current version number */ -#define BTREE_MIN_VERSION 2 /* minimum supported version */ -#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ - -/* - * Maximum size of a btree index entry, including its tuple header. - * - * We actually need to be able to fit three items on every page, - * so restrict any one item to 1/3 the per-page available space. - * - * There are rare cases where _bt_truncate() will need to enlarge - * a heap index tuple to make space for a tiebreaker heap TID - * attribute, which we account for here. - */ -#define BTMaxItemSize(page) \ - (MAXALIGN_DOWN((PageGetPageSize(page) - \ - MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ - MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ - MAXALIGN(sizeof(ItemPointerData))) -#define BTMaxItemSizeNoHeapTid(page) \ - MAXALIGN_DOWN((PageGetPageSize(page) - \ - MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ - MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - -/* - * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples - * that may be stored on a btree leaf page. 
It is used to size the - * per-page temporary buffers. - * - * Note: we don't bother considering per-tuple overheads here to keep - * things simple (value is based on how many elements a single array of - * heap TIDs must have to fill the space between the page header and - * special area). The value is slightly higher (i.e. more conservative) - * than necessary as a result, which is considered acceptable. - */ -#define MaxTIDsPerBTreePage \ - (int) ((BLCKSZ - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ - sizeof(ItemPointerData)) - -/* - * The leaf-page fillfactor defaults to 90% but is user-adjustable. - * For pages above the leaf level, we use a fixed 70% fillfactor. - * The fillfactor is applied during index build and when splitting - * a rightmost page; when splitting non-rightmost pages we try to - * divide the data equally. When splitting a page that's entirely - * filled with a single value (duplicates), the effective leaf-page - * fillfactor is 96%, regardless of whether the page is a rightmost - * page. - */ -#define BTREE_MIN_FILLFACTOR 10 -#define BTREE_DEFAULT_FILLFACTOR 90 -#define BTREE_NONLEAF_FILLFACTOR 70 -#define BTREE_SINGLEVAL_FILLFACTOR 96 - -/* - * In general, the btree code tries to localize its knowledge about - * page layout to a couple of routines. However, we need a special - * value to indicate "no page number" in those places where we expect - * page numbers. We can use zero for this because we never need to - * make a pointer to the metadata page. - */ - -#define P_NONE 0 - -/* - * Macros to test whether a page is leftmost or rightmost on its tree level, - * as well as other state info kept in the opaque data. 
- */ -#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) -#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) -#define P_ISLEAF(opaque) (((opaque)->btpo_flags & BTP_LEAF) != 0) -#define P_ISROOT(opaque) (((opaque)->btpo_flags & BTP_ROOT) != 0) -#define P_ISDELETED(opaque) (((opaque)->btpo_flags & BTP_DELETED) != 0) -#define P_ISMETA(opaque) (((opaque)->btpo_flags & BTP_META) != 0) -#define P_ISHALFDEAD(opaque) (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0) -#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) -#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) -#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) -#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) - /* * BTDeletedPageData is the page contents of a deleted page */ @@ -892,7 +691,7 @@ typedef struct BTDedupStateData * are implicitly unchanged by deduplication pass). */ int nintervals; /* current number of intervals in array */ - BTDedupInterval intervals[MaxIndexTuplesPerPage]; + BTDedupInterval intervals[MaxIndexTuplesPerPageLimit]; } BTDedupStateData; typedef BTDedupStateData *BTDedupState; @@ -987,7 +786,7 @@ typedef struct BTScanPosData int lastItem; /* last valid index in items[] */ int itemIndex; /* current index in items[] */ - BTScanPosItem items[MaxTIDsPerBTreePage]; /* MUST BE LAST */ + BTScanPosItem items[MaxTIDsPerBTreePageLimit]; /* MUST BE LAST */ } BTScanPosData; typedef BTScanPosData *BTScanPos; @@ -1057,7 +856,7 @@ typedef struct BTScanOpaqueData /* * If we are doing an index-only scan, these are the tuple storage * workspaces for the currPos and markPos respectively. Each is of size - * BLCKSZ, so it can hold as much as a full page's worth of tuples. + * CLUSTER_BLOCK_SIZE, so it can hold as much as a full page's worth of tuples. 
*/ char *currTuples; /* tuple storage for currPos */ char *markTuples; /* tuple storage for markPos */ @@ -1104,7 +903,7 @@ typedef struct BTOptions ((BTOptions *) (relation)->rd_options)->fillfactor : \ BTREE_DEFAULT_FILLFACTOR) #define BTGetTargetPageFreeSpace(relation) \ - (BLCKSZ * (100 - BTGetFillFactor(relation)) / 100) + (CLUSTER_BLOCK_SIZE * (100 - BTGetFillFactor(relation)) / 100) #define BTGetDeduplicateItems(relation) \ (AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \ relation->rd_rel->relam == BTREE_AM_OID), \ diff --git a/src/include/access/nbtree_int.h b/src/include/access/nbtree_int.h new file mode 100644 index 0000000000..b6dce4743f --- /dev/null +++ b/src/include/access/nbtree_int.h @@ -0,0 +1,220 @@ +/*------------------------------------------------------------------------- + * + * nbtree_int.h + * internal definitions for the postgres btree access method implementation. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtree_int.h + * + *------------------------------------------------------------------------- + */ +#ifndef NBTREE_INT_H +#define NBTREE_INT_H + +#include "storage/block.h" + +/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */ +typedef uint16 BTCycleId; + +/* + * BTPageOpaqueData -- At the end of every page, we store a pointer + * to both siblings in the tree. This is used to do forward/backward + * index scans. The next-page link is also critical for recovery when + * a search has navigated to the wrong page due to concurrent page splits + * or deletions; see src/backend/access/nbtree/README for more info. + * + * In addition, we store the page's btree level (counting upwards from + * zero at a leaf page) as well as some flag bits indicating the page type + * and status.
If the page is deleted, a BTDeletedPageData struct is stored + * in the page's tuple area, while a standard BTPageOpaqueData struct is + * stored in the page special area. + * + * We also store a "vacuum cycle ID". When a page is split while VACUUM is + * processing the index, a nonzero value associated with the VACUUM run is + * stored into both halves of the split page. (If VACUUM is not running, + * both pages receive zero cycleids.) This allows VACUUM to detect whether + * a page was split since it started, with a small probability of false match + * if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs + * ago. Also, during a split, the BTP_SPLIT_END flag is cleared in the left + * (original) page, and set in the right page, but only if the next page + * to its right has a different cycleid. + * + * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested + * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid in the same field. We now store + * 64-bit/full safexid values using BTDeletedPageData instead. + */ + +typedef struct BTPageOpaqueData +{ + BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ + BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ + uint32 btpo_level; /* tree level --- zero for leaf pages */ + uint16 btpo_flags; /* flag bits, see below */ + BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ +} BTPageOpaqueData; + +typedef BTPageOpaqueData *BTPageOpaque; + +#define BTPageGetOpaque(page) ((BTPageOpaque) PageGetSpecialPointer(page)) + +/* Bits defined in btpo_flags */ +#define BTP_LEAF (1 << 0) /* leaf page, i.e. 
not internal page */ +#define BTP_ROOT (1 << 1) /* root page (has no parent) */ +#define BTP_DELETED (1 << 2) /* page has been deleted from tree */ +#define BTP_META (1 << 3) /* meta-page */ +#define BTP_HALF_DEAD (1 << 4) /* empty, but still in tree */ +#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ +#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ +#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ + +/* + * The max allowed value of a cycle ID is a bit less than 64K. This is + * for convenience of pg_filedump and similar utilities: we want to use + * the last 2 bytes of special space as an index type indicator, and + * restricting cycle ID lets btree use that space for vacuum cycle IDs + * while still allowing index type to be identified. + */ +#define MAX_BT_CYCLE_ID 0xFF7F + + +/* + * The Meta page is always the first page in the btree index. + * Its primary purpose is to point to the location of the btree root page. + * We also point to the "fast" root, which is the current effective root; + * see README for discussion. + */ + +typedef struct BTMetaPageData +{ + uint32 btm_magic; /* should contain BTREE_MAGIC */ + uint32 btm_version; /* nbtree version (always <= BTREE_VERSION) */ + BlockNumber btm_root; /* current root location */ + uint32 btm_level; /* tree level of the root page */ + BlockNumber btm_fastroot; /* current "fast" root location */ + uint32 btm_fastlevel; /* tree level of the "fast" root page */ + /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup (deprecated) */ + float8 btm_last_cleanup_num_heap_tuples; + + bool btm_allequalimage; /* are all columns "equalimage"? 
*/ +} BTMetaPageData; + +#define BTPageGetMeta(p) \ + ((BTMetaPageData *) PageGetContents(p)) + +/* + * The current Btree version is 4. That's what you'll get when you create + * a new index. + * + * Btree version 3 was used in PostgreSQL v11. It is mostly the same as + * version 4, but heap TIDs were not part of the keyspace. Index tuples + * with duplicate keys could be stored in any order. We continue to + * support reading and writing Btree versions 2 and 3, so that they don't + * need to be immediately re-indexed at pg_upgrade. In order to get the + * new heapkeyspace semantics, however, a REINDEX is needed. + * + * Deduplication is safe to use when the btm_allequalimage field is set to + * true. It's safe to read the btm_allequalimage field on version 3, but + * only version 4 indexes make use of deduplication. Even version 4 + * indexes created on PostgreSQL v12 will need a REINDEX to make use of + * deduplication, though, since there is no other way to set + * btm_allequalimage to true (pg_upgrade hasn't been taught to set the + * metapage field). + * + * Btree version 2 is mostly the same as version 3. There are two new + * fields in the metapage that were introduced in version 3. A version 2 + * metapage will be automatically upgraded to version 3 on the first + * insert to it. INCLUDE indexes cannot use version 2. + */ +#define BTREE_METAPAGE 0 /* first page is meta */ +#define BTREE_MAGIC 0x053162 /* magic number in metapage */ +#define BTREE_VERSION 4 /* current version number */ +#define BTREE_MIN_VERSION 2 /* minimum supported version */ +#define BTREE_NOVAC_VERSION 3 /* version with all meta fields set */ + +/* + * Maximum size of a btree index entry, including its tuple header. + * + * We actually need to be able to fit three items on every page, + * so restrict any one item to 1/3 the per-page available space. 
+ * + * There are rare cases where _bt_truncate() will need to enlarge + * a heap index tuple to make space for a tiebreaker heap TID + * attribute, which we account for here. + */ +#define BTMaxItemSize(page) \ + (MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) - \ + MAXALIGN(sizeof(ItemPointerData))) +#define BTMaxItemSizeNoHeapTid(page) \ + MAXALIGN_DOWN((PageGetPageSize(page) - \ + MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \ + MAXALIGN(sizeof(BTPageOpaqueData))) / 3) + +/* + * MaxTIDsPerBTreePage is an upper bound on the number of heap TIDs tuples + * that may be stored on a btree leaf page. It is used to size the + * per-page temporary buffers. + * + * Note: we don't bother considering per-tuple overheads here to keep + * things simple (value is based on how many elements a single array of + * heap TIDs must have to fill the space between the page header and + * special area). The value is slightly higher (i.e. more conservative) + * than necessary as a result, which is considered acceptable. + */ +#define MaxTIDsPerBTreePage GetBlockSize(BS_MAX_TIDS_PER_BTREE_PAGE) +#define MaxTIDsPerBTreePageLimit CalcMaxTIDsPerBTreePage(MAX_BLOCK_SIZE) + +/* + * The leaf-page fillfactor defaults to 90% but is user-adjustable. + * For pages above the leaf level, we use a fixed 70% fillfactor. + * The fillfactor is applied during index build and when splitting + * a rightmost page; when splitting non-rightmost pages we try to + * divide the data equally. When splitting a page that's entirely + * filled with a single value (duplicates), the effective leaf-page + * fillfactor is 96%, regardless of whether the page is a rightmost + * page. 
+ */ +#define BTREE_MIN_FILLFACTOR 10 +#define BTREE_DEFAULT_FILLFACTOR 90 +#define BTREE_NONLEAF_FILLFACTOR 70 +#define BTREE_SINGLEVAL_FILLFACTOR 96 + +/* + * In general, the btree code tries to localize its knowledge about + * page layout to a couple of routines. However, we need a special + * value to indicate "no page number" in those places where we expect + * page numbers. We can use zero for this because we never need to + * make a pointer to the metadata page. + */ + +#define P_NONE 0 + +/* + * Macros to test whether a page is leftmost or rightmost on its tree level, + * as well as other state info kept in the opaque data. + */ +#define P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE) +#define P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE) +#define P_ISLEAF(opaque) (((opaque)->btpo_flags & BTP_LEAF) != 0) +#define P_ISROOT(opaque) (((opaque)->btpo_flags & BTP_ROOT) != 0) +#define P_ISDELETED(opaque) (((opaque)->btpo_flags & BTP_DELETED) != 0) +#define P_ISMETA(opaque) (((opaque)->btpo_flags & BTP_META) != 0) +#define P_ISHALFDEAD(opaque) (((opaque)->btpo_flags & BTP_HALF_DEAD) != 0) +#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) +#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) +#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) + +#endif diff --git a/src/include/access/slru.h b/src/include/access/slru.h index a8a424d92d..a477bcdf6b 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -19,7 +19,7 @@ /* - * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere + * Define SLRU segment size. A page is the same CLUSTER_BLOCK_SIZE as is used everywhere * else in Postgres. The segment size can be chosen somewhat arbitrarily; * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG * or 64K transactions for SUBTRANS. 
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index c6ef46fc20..cb5a10e6b3 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -36,7 +36,7 @@ typedef struct SpGistOptions ((SpGistOptions *) (relation)->rd_options)->fillfactor : \ SPGIST_DEFAULT_FILLFACTOR) #define SpGistGetTargetPageFreeSpace(relation) \ - (BLCKSZ * (100 - SpGistGetFillFactor(relation)) / 100) + (CLUSTER_BLOCK_SIZE * (100 - SpGistGetFillFactor(relation)) / 100) /* SPGiST leaf tuples have one key column, optionally have included columns */ @@ -226,14 +226,14 @@ typedef struct SpGistScanOpaqueData TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ int nPtrs; /* number of TIDs found on current page */ int iPtr; /* index for scanning through same */ - ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ - bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ - bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck + ItemPointerData heapPtrs[MaxIndexTuplesPerPageLimit]; /* TIDs from cur page */ + bool recheck[MaxIndexTuplesPerPageLimit]; /* their recheck flags */ + bool recheckDistances[MaxIndexTuplesPerPageLimit]; /* distance recheck * flags */ - HeapTuple reconTups[MaxIndexTuplesPerPage]; /* reconstructed tuples */ + HeapTuple reconTups[MaxIndexTuplesPerPageLimit]; /* reconstructed tuples */ /* distances (for recheck) */ - IndexOrderByDistance *distances[MaxIndexTuplesPerPage]; + IndexOrderByDistance *distances[MaxIndexTuplesPerPageLimit]; /* * Note: using MaxIndexTuplesPerPage above is a bit hokey since @@ -445,7 +445,7 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; /* Page capacity after allowing for fixed header and special space */ #define SPGIST_PAGE_CAPACITY \ - MAXALIGN_DOWN(BLCKSZ - \ + MAXALIGN_DOWN(CLUSTER_BLOCK_SIZE - \ SizeOfPageHeaderData - \ MAXALIGN(sizeof(SpGistPageOpaqueData))) diff --git a/src/include/access/xlog_internal.h 
b/src/include/access/xlog_internal.h index b0fd338a00..3bc419ed78 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -391,6 +391,7 @@ extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli); extern void XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, bool detailed_format, StringInfo buf, uint32 *fpi_len); +extern uint32 ClusterBlockSize(void); /* * Exported for the functions in timeline.c and xlogarchive.c. Only valid diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index f355e08e1d..ef4cad64ac 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -122,21 +122,21 @@ typedef struct XLogRecordBlockHeader * in the middle, which contains only zero bytes. Since we know that the * "hole" is all zeros, we remove it from the stored data (and it's not counted * in the XLOG record's CRC, either). Hence, the amount of block data actually - * present is (BLCKSZ - ). + * present is (CLUSTER_BLOCK_SIZE - ). * * Additionally, when wal_compression is enabled, we will try to compress full * page images using one of the supported algorithms, after removing the * "hole". This can reduce the WAL volume, but at some extra cost of CPU spent * on the compression during WAL logging. In this case, since the "hole" * length cannot be calculated by subtracting the number of page image bytes - * from BLCKSZ, basically it needs to be stored as an extra information. + * from CLUSTER_BLOCK_SIZE, basically it needs to be stored as an extra information. * But when no "hole" exists, we can assume that the "hole" length is zero * and no such an extra information needs to be stored. Note that * the original version of page image is stored in WAL instead of the * compressed one if the number of bytes saved by compression is less than * the length of extra information. 
Hence, when a page image is successfully * compressed, the amount of block data actually present is less than - * BLCKSZ - the length of "hole" bytes - the length of extra information. + * CLUSTER_BLOCK_SIZE - the length of "hole" bytes - the length of extra information. */ typedef struct XLogRecordBlockImageHeader { diff --git a/src/include/backup/basebackup_sink.h b/src/include/backup/basebackup_sink.h index 224732e333..012bfb7725 100644 --- a/src/include/backup/basebackup_sink.h +++ b/src/include/backup/basebackup_sink.h @@ -80,7 +80,7 @@ typedef struct bbsink_state * 'bbs_ops' is the relevant callback table. * * 'bbs_buffer' is the buffer into which data destined for the bbsink - * should be stored. It must be a multiple of BLCKSZ. + * should be stored. It must be a multiple of CLUSTER_BLOCK_SIZE. * * 'bbs_buffer_length' is the allocated length of the buffer. * @@ -183,7 +183,7 @@ bbsink_begin_backup(bbsink *sink, bbsink_state *state, int buffer_length) sink->bbs_ops->begin_backup(sink); Assert(sink->bbs_buffer != NULL); - Assert((sink->bbs_buffer_length % BLCKSZ) == 0); + Assert((sink->bbs_buffer_length % CLUSTER_BLOCK_SIZE) == 0); } /* Begin an archive. */ diff --git a/src/include/c.h b/src/include/c.h index f69d739be5..82584b123a 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1118,7 +1118,7 @@ extern void ExceptionalCondition(const char *conditionName, ((var) = ((var) < 0) ? 1 : -(var)) /* - * Use this, not "char buf[BLCKSZ]", to declare a field or local variable + * Use this, not "char buf[MAX_BLOCK_SIZE]", to declare a field or local variable * holding a page buffer, if that page might be accessed as a page. Otherwise * the variable might be under-aligned, causing problems on alignment-picky * hardware. 
We include both "double" and "int64" in the union to ensure that @@ -1127,7 +1127,7 @@ extern void ExceptionalCondition(const char *conditionName, */ typedef union PGAlignedBlock { - char data[BLCKSZ]; + char data[MAX_BLOCK_SIZE]; double force_align_d; int64 force_align_i64; } PGAlignedBlock; @@ -1145,7 +1145,7 @@ typedef union PGIOAlignedBlock #ifdef pg_attribute_aligned pg_attribute_aligned(PG_IO_ALIGN_SIZE) #endif - char data[BLCKSZ]; + char data[MAX_BLOCK_SIZE]; double force_align_d; int64 force_align_i64; } PGIOAlignedBlock; diff --git a/src/include/common/blocksize.h b/src/include/common/blocksize.h new file mode 100644 index 0000000000..d8343c4b51 --- /dev/null +++ b/src/include/common/blocksize.h @@ -0,0 +1,91 @@ +/*------------------------------------------------------------------------- + * + * blocksize.h + * definitions for cluster-specific limits/structure defs + * + * + * Copyright (c) 2023, PostgreSQL Global Development Group + * + * IDENTIFICATION: src/include/common/blocksize.h + * + *------------------------------------------------------------------------- + */ + +#ifndef BLOCKSIZE_H +#define BLOCKSIZE_H + +#ifndef DEFAULT_BLOCK_SIZE +#define DEFAULT_BLOCK_SIZE 8192 +#endif + +#ifndef MIN_BLOCK_SIZE +#define MIN_BLOCK_SIZE 1024 +#endif + +#ifndef MAX_BLOCK_SIZE +#define MAX_BLOCK_SIZE 32*1024 +#endif + +#define IsValidBlockSize(size) ((size) >= MIN_BLOCK_SIZE && \ + (size) <= MAX_BLOCK_SIZE && \ + ((size)&((size)-1)) == 0) + +typedef enum ClusterSize { + BS_BLOCK_SIZE = 0, + BS_MAX_HEAP_TUPLES, + BS_MAX_HEAP_TUPLES_PER_PAGE, + BS_MAX_INDEX_TUPLES_PER_PAGE, + BS_MAX_TIDS_PER_BTREE_PAGE, + BS_TOAST_MAX_CHUNK_SIZE, + BS_NUM_SIZES +} ClusterSize; + +extern PGDLLIMPORT uint32 calculated_block_sizes[BS_NUM_SIZES]; + +void BlockSizeInit(Size rawblocksize); +#define GetBlockSize(theSize) (calculated_block_sizes[theSize]) +#define CLUSTER_BLOCK_SIZE GetBlockSize(BS_BLOCK_SIZE) +#define CLUSTER_RELSEG_SIZE (RELSEG_SIZE * DEFAULT_BLOCK_SIZE / CLUSTER_BLOCK_SIZE)
+ +/* Specific calculations' now parameterized sources */ + +/* originally in heaptoast.h */ + +#define CalcMaximumBytesPerTuple(blocksize,tuplesPerPage) \ + MAXALIGN_DOWN((blocksize - \ + MAXALIGN(SizeOfPageHeaderData + (tuplesPerPage) * sizeof(ItemIdData))) \ + / (tuplesPerPage)) + +#define CalcToastMaxChunkSize(blocksize) \ + (CalcMaximumBytesPerTuple(blocksize,EXTERN_TUPLES_PER_PAGE) - \ + MAXALIGN(SizeofHeapTupleHeader) - \ + sizeof(Oid) - \ + sizeof(int32) - \ + VARHDRSZ) + +/* originally in htup_details.h */ + +#define CalcMaxHeapTupleSize(size) (size - MAXALIGN(SizeOfPageHeaderData + sizeof(ItemIdData))) + +#define CalcMaxHeapTuplesPerPage(size) \ + ((int) (((size) - SizeOfPageHeaderData) / \ + (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData)))) + +/* originally in itup.h */ + +#define CalcMaxIndexTuplesPerPage(size) \ + ((int) ((size - SizeOfPageHeaderData) / \ + (MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData)))) + +/* originally in nbtree_int.h */ + +#define CalcMaxTIDsPerBTreePage(size) \ + (int) ((size - SizeOfPageHeaderData - sizeof(BTPageOpaqueData)) / \ + sizeof(ItemPointerData)) + +/* originally in bloom.h */ +#define CalcFreeBlockNumberElems(size) (MAXALIGN_DOWN(size - SizeOfPageHeaderData - MAXALIGN(sizeof(BloomPageOpaqueData)) \ + - MAXALIGN(sizeof(uint16) * 2 + sizeof(uint32) + sizeof(BloomOptions)) \ + ) / sizeof(BlockNumber)) + +#endif diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6d572c3820..5c9fd21eca 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -24,11 +24,11 @@ /* Size of a disk block --- this also limits the size of a tuple. You can set it bigger if you need bigger tuples (although TOAST should reduce the need to have large tuples, since fields can be spread across multiple tuples). - BLCKSZ must be a power of 2. The maximum possible value of BLCKSZ is + CLUSTER_BLOCK_SIZE must be a power of 2. 
The maximum possible value of CLUSTER_BLOCK_SIZE is currently 2^15 (32768). This is determined by the 15-bit widths of the lp_off and lp_len fields in ItemIdData (see include/storage/itemid.h). - Changing BLCKSZ requires an initdb. */ -#undef BLCKSZ + Changing CLUSTER_BLOCK_SIZE requires an initdb. */ +#undef CLUSTER_BLOCK_SIZE /* Saved arguments from configure */ #undef CONFIGURE_ARGS @@ -660,8 +660,8 @@ #undef PTHREAD_CREATE_JOINABLE /* RELSEG_SIZE is the maximum number of blocks allowed in one disk file. Thus, - the maximum size of a single file is RELSEG_SIZE * BLCKSZ; relations bigger - than that are divided into multiple files. RELSEG_SIZE * BLCKSZ must be + the maximum size of a single file is RELSEG_SIZE * CLUSTER_BLOCK_SIZE; relations bigger + than that are divided into multiple files. RELSEG_SIZE * CLUSTER_BLOCK_SIZE must be less than your OS' limit on file size. This is often 2 GB or 4GB in a 32-bit operating system, unless you have large file support enabled. By default, we make the limit 1 GB to avoid any possible integer-overflow @@ -781,7 +781,7 @@ # endif #endif -/* Size of a WAL file block. This need have no particular relation to BLCKSZ. +/* Size of a WAL file block. This need have no particular relation to CLUSTER_BLOCK_SIZE. XLOG_BLCKSZ must be a power of 2, and if your system supports O_DIRECT I/O, XLOG_BLCKSZ must be a multiple of the alignment requirement for direct-I/O buffers, else direct I/O may fail. Changing XLOG_BLCKSZ requires an initdb. diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index a1a93ad706..950b651b4a 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -33,7 +33,7 @@ * * The minimum value is 8 (GIN indexes use 8-argument support functions). * The maximum possible value is around 600 (limited by index tuple size in - * pg_proc's index; BLCKSZ larger than 8K would allow more). Values larger + * pg_proc's index; CLUSTER_BLOCK_SIZE larger than 8K would allow more). 
Values larger * than needed will waste memory and processing time, but do not directly * cost disk space. * @@ -233,6 +233,14 @@ */ #define PG_IO_ALIGN_SIZE 4096 +/* + * Blocksize-related constants + */ + +#define MIN_BLOCK_SIZE 1024 +#define MAX_BLOCK_SIZE 32768 +/* #define DEFAULT_BLOCK_SIZE 8192 */ + /* *------------------------------------------------------------------------ * The following symbols are for enabling debugging code, not for diff --git a/src/include/postgres.h b/src/include/postgres.h index 8a028ff789..4651389fc2 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -45,6 +45,7 @@ #include "c.h" #include "utils/elog.h" #include "utils/palloc.h" +#include "common/blocksize.h" /* ---------------------------------------------------------------- * Section 1: Datum type + support functions diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 0f5fb6be00..9255457dbc 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -323,7 +323,7 @@ BufferGetBlock(Buffer buffer) if (BufferIsLocal(buffer)) return LocalBufferBlockPointers[-buffer - 1]; else - return (Block) (BufferBlocks + ((Size) (buffer - 1)) * BLCKSZ); + return (Block) (BufferBlocks + ((Size) (buffer - 1)) * CLUSTER_BLOCK_SIZE); } /* @@ -341,7 +341,7 @@ static inline Size BufferGetPageSize(Buffer buffer) { AssertMacro(BufferIsValid(buffer)); - return (Size) BLCKSZ; + return (Size) CLUSTER_BLOCK_SIZE; } /* diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 424ecba028..7cd273d5c1 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -325,7 +325,7 @@ static inline void PageValidateSpecialPointer(Page page) { Assert(page); - Assert(((PageHeader) page)->pd_special <= BLCKSZ); + Assert(((PageHeader) page)->pd_special <= CLUSTER_BLOCK_SIZE); Assert(((PageHeader) page)->pd_special >= SizeOfPageHeaderData); } @@ -475,15 +475,15 @@ do { \ PageIsVerifiedExtended(page, blkno, \ 
PIV_LOG_WARNING | PIV_REPORT_STAT) -/* - * Check that BLCKSZ is a multiple of sizeof(size_t). In - * PageIsVerifiedExtended(), it is much faster to check if a page is - * full of zeroes using the native word size. Note that this assertion - * is kept within a header to make sure that StaticAssertDecl() works - * across various combinations of platforms and compilers. - */ -StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)), - "BLCKSZ has to be a multiple of sizeof(size_t)"); +/* /\* */ +/* * Check that CLUSTER_BLOCK_SIZE is a multiple of sizeof(size_t). In */ +/* * PageIsVerifiedExtended(), it is much faster to check if a page is */ +/* * full of zeroes using the native word size. Note that this assertion */ +/* * is kept within a header to make sure that StaticAssertDecl() works */ +/* * across various combinations of platforms and compilers. */ +/* *\/ */ +/* StaticAssertDecl(CLUSTER_BLOCK_SIZE == ((CLUSTER_BLOCK_SIZE / sizeof(size_t)) * sizeof(size_t)), */ +/* "CLUSTER_BLOCK_SIZE has to be a multiple of sizeof(size_t)"); */ extern void PageInit(Page page, Size pageSize, Size specialSize); extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags); diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h index 4afd25a0af..bea1cbd5b6 100644 --- a/src/include/storage/checksum.h +++ b/src/include/storage/checksum.h @@ -19,6 +19,6 @@ * Compute the checksum for a Postgres page. The page must be aligned on a * 4-byte boundary. 
*/ -extern uint16 pg_checksum_page(char *page, BlockNumber blkno); +extern uint16 pg_checksum_page(char *page, BlockNumber blkno, Size pagesize); #endif /* CHECKSUM_H */ diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 7b157161a2..683f7c1c76 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -101,6 +101,7 @@ */ #include "storage/bufpage.h" +#include "common/blocksize.h" /* number of checksums to calculate in parallel */ #define N_SUMS 32 @@ -111,7 +112,7 @@ typedef union { PageHeaderData phdr; - uint32 data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS]; + uint32 data[MAX_BLOCK_SIZE / (sizeof(uint32) * N_SUMS)][N_SUMS]; } PGChecksummablePage; /* @@ -143,7 +144,7 @@ do { \ * (at least on 4-byte boundary). */ static uint32 -pg_checksum_block(const PGChecksummablePage *page) +pg_checksum_block(const PGChecksummablePage *page, Size blocksize) { uint32 sums[N_SUMS]; uint32 result = 0; @@ -151,13 +152,13 @@ pg_checksum_block(const PGChecksummablePage *page) j; /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); + Assert(CLUSTER_BLOCK_SIZE == blocksize); /* initialize partial checksums to their corresponding offsets */ memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) + for (i = 0; i < (uint32) (blocksize / (sizeof(uint32) * N_SUMS)); i++) for (j = 0; j < N_SUMS; j++) CHECKSUM_COMP(sums[j], page->data[i][j]); @@ -184,7 +185,7 @@ pg_checksum_block(const PGChecksummablePage *page) * checksum itself), and the page data. 
*/ uint16 -pg_checksum_page(char *page, BlockNumber blkno) +pg_checksum_page(char *page, BlockNumber blkno, Size pagesize) { PGChecksummablePage *cpage = (PGChecksummablePage *) page; uint16 save_checksum; @@ -201,7 +202,7 @@ pg_checksum_page(char *page, BlockNumber blkno) */ save_checksum = cpage->phdr.pd_checksum; cpage->phdr.pd_checksum = 0; - checksum = pg_checksum_block(cpage); + checksum = pg_checksum_block(cpage, pagesize); cpage->phdr.pd_checksum = save_checksum; /* Mix in the block number to detect transposed pages */ diff --git a/src/include/storage/freespace.h b/src/include/storage/freespace.h index 9e1a85a141..4bbad4aaad 100644 --- a/src/include/storage/freespace.h +++ b/src/include/storage/freespace.h @@ -19,6 +19,7 @@ #include "utils/relcache.h" /* prototypes for public functions in freespace.c */ +extern void FreeSpaceMapInit(void); extern Size GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk); extern BlockNumber GetPageWithFreeSpace(Relation rel, Size spaceNeeded); extern BlockNumber RecordAndGetPageWithFreeSpace(Relation rel, diff --git a/src/include/storage/fsm_internals.h b/src/include/storage/fsm_internals.h index 9e314c83fa..95e4810d3a 100644 --- a/src/include/storage/fsm_internals.h +++ b/src/include/storage/fsm_internals.h @@ -48,10 +48,10 @@ typedef FSMPageData *FSMPage; * Number of non-leaf and leaf nodes, and nodes in total, on an FSM page. * These definitions are internal to fsmpage.c. 
*/ -#define NodesPerPage (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \ +#define NodesPerPage (CLUSTER_BLOCK_SIZE - MAXALIGN(SizeOfPageHeaderData) - \ offsetof(FSMPageData, fp_nodes)) -#define NonLeafNodesPerPage (BLCKSZ / 2 - 1) +#define NonLeafNodesPerPage (CLUSTER_BLOCK_SIZE / 2 - 1) #define LeafNodesPerPage (NodesPerPage - NonLeafNodesPerPage) /* diff --git a/src/include/storage/large_object.h b/src/include/storage/large_object.h index db521f23eb..911667b732 100644 --- a/src/include/storage/large_object.h +++ b/src/include/storage/large_object.h @@ -54,7 +54,7 @@ typedef struct LargeObjectDesc /* * Each "page" (tuple) of a large object can hold this much data * - * We could set this as high as BLCKSZ less some overhead, but it seems + * We could set this as high as CLUSTER_BLOCK_SIZE less some overhead, but it seems * better to make it a smaller value, so that not as much space is used * up when a page-tuple is updated. Note that the value is deliberately * chosen large enough to trigger the tuple toaster, so that we will @@ -67,7 +67,9 @@ typedef struct LargeObjectDesc * * NB: Changing LOBLKSIZE requires an initdb. */ -#define LOBLKSIZE (BLCKSZ / 4) +#define CalcLOBLKSIZE(size) (size/4) +#define LOBLKSIZE (CLUSTER_BLOCK_SIZE / 4) +#define LOBLKSIZE_LIMIT CalcLOBLKSIZE(MAX_BLOCK_SIZE) /* * Maximum length in bytes for a large object. 
To make this larger, we'd diff --git a/src/include/storage/off.h b/src/include/storage/off.h index 3540308069..a608d21bbf 100644 --- a/src/include/storage/off.h +++ b/src/include/storage/off.h @@ -25,7 +25,9 @@ typedef uint16 OffsetNumber; #define InvalidOffsetNumber ((OffsetNumber) 0) #define FirstOffsetNumber ((OffsetNumber) 1) -#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData))) +#define CalcMaxOffsetNumber(size) ((OffsetNumber) (size / sizeof(ItemIdData))) +#define MaxOffsetNumber ((OffsetNumber)CalcMaxOffsetNumber(CLUSTER_BLOCK_SIZE)) +#define MaxOffsetNumberLimit ((OffsetNumber)CalcMaxOffsetNumber(MAX_BLOCK_SIZE)) /* ---------------- * support macros diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index d5a0880678..a47b6b6c2d 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -298,6 +298,7 @@ extern bool ConfigOptionIsVisible(struct config_generic *conf); /* get the current set of variables */ extern struct config_generic **get_guc_variables(int *num_vars); +extern void update_dynamic_gucs(void); extern void build_guc_variables(void); /* search in enum options */ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 1426a353cd..4e1e6e1d97 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -368,14 +368,14 @@ typedef struct StdRdOptions * Returns the relation's desired space usage per page in bytes. */ #define RelationGetTargetPageUsage(relation, defaultff) \ - (BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100) + (CLUSTER_BLOCK_SIZE * RelationGetFillFactor(relation, defaultff) / 100) /* * RelationGetTargetPageFreeSpace * Returns the relation's desired freespace per page in bytes. 
*/ #define RelationGetTargetPageFreeSpace(relation, defaultff) \ - (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) + (CLUSTER_BLOCK_SIZE * (100 - RelationGetFillFactor(relation, defaultff)) / 100) /* * RelationIsUsedAsCatalogTable diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c index ae21444c47..f3a66aa4a0 100644 --- a/src/test/modules/test_slru/test_slru.c +++ b/src/test/modules/test_slru/test_slru.c @@ -77,7 +77,7 @@ test_slru_page_write(PG_FUNCTION_ARGS) /* write given data to the page, up to the limit of the page */ strncpy(TestSlruCtl->shared->page_buffer[slotno], data, - BLCKSZ - 1); + CLUSTER_BLOCK_SIZE - 1); SimpleLruWritePage(TestSlruCtl, slotno); LWLockRelease(TestSLRULock); diff --git a/src/test/regress/expected/btree_index.out b/src/test/regress/expected/btree_index.out index 93ed5e8cc0..eae79e793e 100644 --- a/src/test/regress/expected/btree_index.out +++ b/src/test/regress/expected/btree_index.out @@ -320,7 +320,7 @@ CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicat CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH (deduplicate_items=off); -- Generate enough garbage tuples in index to ensure that even the unique index -- with deduplication enabled has to check multiple leaf pages during unique --- checking (at least with a BLCKSZ of 8192 or less) +-- checking (at least with a CLUSTER_BLOCK_SIZE of 8192 or less) DO $$ BEGIN FOR r IN 1..1350 LOOP @@ -331,7 +331,7 @@ END$$; -- Exercise the LP_DEAD-bit-set tuple deletion code with a posting list tuple. 
-- The implementation prefers deleting existing items to merging any duplicate -- tuples into a posting list, so we need an explicit test to make sure we get --- coverage (note that this test also assumes BLCKSZ is 8192 or less): +-- coverage (note that this test also assumes CLUSTER_BLOCK_SIZE is 8192 or less): DROP INDEX plain_unique; DELETE FROM dedup_unique_test_table WHERE a = 1; INSERT INTO dedup_unique_test_table SELECT i FROM generate_series(0,450) i; diff --git a/src/test/regress/expected/largeobject.out b/src/test/regress/expected/largeobject.out index bdcede6728..b2e9ab8f84 100644 --- a/src/test/regress/expected/largeobject.out +++ b/src/test/regress/expected/largeobject.out @@ -353,7 +353,7 @@ SELECT lo_lseek(fd, 0, 2) FROM lotest_stash_values; 670800 (1 row) --- with the default BLCKSZ, LOBLKSIZE = 2048, so this positions us for a block +-- with the default CLUSTER_BLOCK_SIZE, LOBLKSIZE = 2048, so this positions us for a block -- edge case SELECT lo_lseek(fd, 2030, 0) FROM lotest_stash_values; lo_lseek diff --git a/src/test/regress/expected/largeobject_1.out b/src/test/regress/expected/largeobject_1.out index d700910c35..f2f4853682 100644 --- a/src/test/regress/expected/largeobject_1.out +++ b/src/test/regress/expected/largeobject_1.out @@ -353,7 +353,7 @@ SELECT lo_lseek(fd, 0, 2) FROM lotest_stash_values; 680800 (1 row) --- with the default BLCKSZ, LOBLKSIZE = 2048, so this positions us for a block +-- with the default CLUSTER_BLOCK_SIZE, LOBLKSIZE = 2048, so this positions us for a block -- edge case SELECT lo_lseek(fd, 2030, 0) FROM lotest_stash_values; lo_lseek diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql index 239f4a4755..372e8efbff 100644 --- a/src/test/regress/sql/btree_index.sql +++ b/src/test/regress/sql/btree_index.sql @@ -173,7 +173,7 @@ CREATE UNIQUE INDEX dedup_unique ON dedup_unique_test_table (a) WITH (deduplicat CREATE UNIQUE INDEX plain_unique ON dedup_unique_test_table (a) WITH 
(deduplicate_items=off); -- Generate enough garbage tuples in index to ensure that even the unique index -- with deduplication enabled has to check multiple leaf pages during unique --- checking (at least with a BLCKSZ of 8192 or less) +-- checking (at least with a CLUSTER_BLOCK_SIZE of 8192 or less) DO $$ BEGIN FOR r IN 1..1350 LOOP @@ -185,7 +185,7 @@ END$$; -- Exercise the LP_DEAD-bit-set tuple deletion code with a posting list tuple. -- The implementation prefers deleting existing items to merging any duplicate -- tuples into a posting list, so we need an explicit test to make sure we get --- coverage (note that this test also assumes BLCKSZ is 8192 or less): +-- coverage (note that this test also assumes CLUSTER_BLOCK_SIZE is 8192 or less): DROP INDEX plain_unique; DELETE FROM dedup_unique_test_table WHERE a = 1; INSERT INTO dedup_unique_test_table SELECT i FROM generate_series(0,450) i; diff --git a/src/test/regress/sql/largeobject.sql b/src/test/regress/sql/largeobject.sql index 800e4fcc6a..0740c7e796 100644 --- a/src/test/regress/sql/largeobject.sql +++ b/src/test/regress/sql/largeobject.sql @@ -191,7 +191,7 @@ UPDATE lotest_stash_values SET fd=lo_open(loid, CAST(x'20000' | x'40000' AS inte -- verify length of large object SELECT lo_lseek(fd, 0, 2) FROM lotest_stash_values; --- with the default BLCKSZ, LOBLKSIZE = 2048, so this positions us for a block +-- with the default CLUSTER_BLOCK_SIZE, LOBLKSIZE = 2048, so this positions us for a block -- edge case SELECT lo_lseek(fd, 2030, 0) FROM lotest_stash_values; diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index 9e05eb91b1..ead1d4e26f 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -135,7 +135,7 @@ sub mkvcbuild } our @pgcommonallfiles = qw( - archive.c base64.c checksum_helper.c compression.c + archive.c base64.c blocksize.c checksum_helper.c compression.c config_info.c controldata_utils.c d2s.c encnames.c exec.c f2s.c file_perm.c file_utils.c hashfn.c 
ip.c jsonapi.c keywords.c kwlookup.c link-canary.c md5_common.c percentrepl.c diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index b6d31c3583..280f41a875 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -212,10 +212,10 @@ sub GenerateFiles ALIGNOF_PG_INT128_TYPE => undef, ALIGNOF_SHORT => 2, AC_APPLE_UNIVERSAL_BUILD => undef, - BLCKSZ => 1024 * $self->{options}->{blocksize}, CONFIGURE_ARGS => '"' . $self->GetFakeConfigure() . '"', DEF_PGPORT => $port, DEF_PGPORT_STR => qq{"$port"}, + DEFAULT_BLOCK_SIZE => 1024 * $self->{options}->{blocksize}, DLSUFFIX => '".dll"', ENABLE_GSS => $self->{options}->{gss} ? 1 : undef, ENABLE_NLS => $self->{options}->{nls} ? 1 : undef, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 260854747b..ada9fc4f70 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -893,7 +893,6 @@ Form_pg_ts_template Form_pg_type Form_pg_user_mapping FormatNode -FreeBlockNumberArray FreeListData FreePageBtree FreePageBtreeHeader -- 2.39.2